;*****************************************************************************
;* Copyright (C) 2013-2017 MulticoreWare, Inc
;*
;* Authors: Nabajit Deka <nabajit@multicorewareinc.com>
;*          Min Chen <chenm003@163.com> <min.chen@multicorewareinc.com>
;*          Li Cao <li@multicorewareinc.com>
;*          Praveen Kumar Tiwari <Praveen@multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at license @ x265.com.
;*****************************************************************************/

; TODO: Further optimize the routines.

%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA 64

; tab_dct32: 32 rows x 32 signed 16-bit coefficients (64 bytes per row, 2048 bytes).
; Row 0 is the constant 64; every later row is symmetric or antisymmetric about
; its midpoint.
; NOTE(review): values look like the HEVC 32-point integer DCT basis -- confirm
; against the reference transform matrix before editing any entry.
tab_dct32:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
                dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13,  4, -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90
                dw 90, 87, 80, 70, 57, 43, 25,  9, -9, -25, -43, -57, -70, -80, -87, -90, -90, -87, -80, -70, -57, -43, -25, -9,  9, 25, 43, 57, 70, 80, 87, 90
                dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, 13, 38, 61, 78, 88, 90, 85, 73, 54, 31,  4, -22, -46, -67, -82, -90
                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
                dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, -22, -61, -85, -90, -73, -38,  4, 46, 78, 90, 82, 54, 13, -31, -67, -88
                dw 87, 57,  9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43,  9, 57, 87
                dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, 31, 78, 90, 61,  4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85
                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
                dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67,  4, 73, 88, 38, -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82
                dw 80,  9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70,  9, 80
                dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82,  4, -78
                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
                dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, -54, -85,  4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73
                dw 70, -43, -87,  9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90,  9, -87, -43, 70
                dw 67, -54, -78, 38, 85, -22, -90,  4, 90, 13, -88, -31, 82, 46, -73, -61, 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67
                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
                dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, -67, -54, 78, 38, -85, -22, 90,  4, -90, 13, 88, -31, -82, 46, 73, -61
                dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87,  9, -90, 25, 80, -57, -57, 80, 25, -90,  9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57
                dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88,  4, 85, -54
                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
                dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82,  4, 78, -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46
                dw 43, -90, 57, 25, -87, 70,  9, -80, 80, -9, -70, 87, -25, -57, 90, -43, -43, 90, -57, -25, 87, -70, -9, 80, -80,  9, 70, -87, 25, 57, -90, 43
                dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67,  4, -73, 88, -38
                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
                dw 31, -78, 90, -61,  4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31
                dw 25, -70, 90, -80, 43,  9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, -25, 70, -90, 80, -43, -9, 57, -87, 87, -57,  9, 43, -80, 90, -70, 25
                dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, 88, -67, 31, 13, -54, 82, -90, 78, -46,  4, 38, -73, 90, -85, 61, -22
                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
                dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31,  4, 22, -46, 67, -82, 90, -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13
                dw  9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25,  9
                dw  4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4
; tab_dct16: 16 rows x 16 signed 16-bit coefficients (32 bytes per row, 512 bytes).
; Row k holds the same values as the first 16 entries of row 2*k of tab_dct32.
; NOTE(review): presumably the 16-point forward DCT basis -- confirm at the use sites.
tab_dct16:      dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
                dw 90, 87, 80, 70, 57, 43, 25,  9, -9, -25, -43, -57, -70, -80, -87, -90
                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
                dw 87, 57,  9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87
                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
                dw 80,  9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80
                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
                dw 70, -43, -87,  9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70
                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
                dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87,  9, -90, 25, 80, -57
                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
                dw 43, -90, 57, 25, -87, 70,  9, -80, 80, -9, -70, 87, -25, -57, 90, -43
                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
                dw 25, -70, 90, -80, 43,  9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25
                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
                dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9

; Index tables, 64 bytes each. The four dq tables hold 8 qword indices in 0..15;
; dct16_shuf2_AVX512 holds 16 dword indices in 0..30.
; NOTE(review): indices exceed the element count of one 64-byte vector, so these
; are presumably two-source permute controls -- confirm at the use sites.
dct16_shuf_AVX512:  dq 0, 1, 8, 9, 4, 5, 12, 13
dct16_shuf1_AVX512: dq 2, 3, 10, 11, 6, 7, 14, 15
dct16_shuf3_AVX512: dq 0, 1, 4, 5, 8, 9, 12, 13
dct16_shuf4_AVX512: dq 2, 3, 6, 7, 10, 11, 14, 15
dct16_shuf2_AVX512: dd 0, 4, 8, 12, 2, 6, 10, 14, 16, 20, 24, 28, 18, 22, 26, 30

; Index tables: dq lines hold 8 qword indices, dd lines hold 16 dword indices.
; dct8_shuf5_AVX512 and dct8_shuf6_AVX512 currently contain identical values
; (even qwords first, then odd qwords); both labels are kept as separate symbols.
; dct8_shuf4_AVX512 emits its 16-dword pattern twice (128 bytes).
; NOTE(review): presumably permute controls for the AVX-512 dct8/dct16 kernels -- confirm at the use sites.
dct8_shuf5_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7
dct8_shuf6_AVX512: dq 0, 2, 4, 6, 1, 3, 5, 7
dct8_shuf8_AVX512: dd 0, 2, 8, 10, 4, 6, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
dct8_shuf4_AVX512: times 2 dd 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
dct16_shuf7_AVX512: dd 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
dct16_shuf9_AVX512: dd 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15

; dct32_shuf_AVX512 / dct32_shuf4_AVX512: 16 dword indices each.
; dct32_shuf5..8_AVX512: 16-dword masks with exactly one group of four dwords set
; to -1 (all ones): shuf5 -> dwords 4-7, shuf6 -> dwords 8-11, shuf7 -> dwords
; 12-15, shuf8 -> dwords 0-3.
; dct16_shuf5_AVX512 holds 32 word indices; dct16_shuf6/8_AVX512 hold 16 word indices.
; NOTE(review): presumably permute controls and lane-select masks -- confirm at the use sites.
dct32_shuf_AVX512:  dd 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20 , 21, 24, 25, 28, 29
dct32_shuf4_AVX512: times 2 dd 0, 4, 8, 12, 0, 4, 8, 12
dct32_shuf5_AVX512: dd 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0
dct32_shuf6_AVX512: dd 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, 0, 0, 0, 0
dct32_shuf7_AVX512: dd 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1
dct32_shuf8_AVX512: dd -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
dct16_shuf5_AVX512: dw 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
dct16_shuf6_AVX512: dw 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
dct16_shuf8_AVX512: dw 20, 0, 4, 2, 28, 8, 6, 10, 22, 16, 12, 18, 30, 24, 14, 26

; Word-granularity index tables. dct8_shuf7_AVX512 and the two `times 2` tables
; hold 16 word indices (32 bytes); dct32_shuf1/2_AVX512 hold 32 word indices (64 bytes).
; dct32_shuf1_AVX512 lists words 15..0 then 31..16, i.e. each 16-word half in
; reverse order; dct32_shuf2_AVX512 keeps words 0-7 and 16-23 in order and
; lists words 8-15 and 24-31 reversed.
; NOTE(review): presumably word-permute controls -- confirm at the use sites.
dct8_shuf7_AVX512: dw 0, 2, 16, 18, 8, 10, 24, 26, 4, 6, 20, 22, 12, 14, 28, 30
dct8_shuf9_AVX512: times 2 dw 0, 8, 16, 24, 4, 12, 20, 28
dct32_shuf1_AVX512: dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
dct32_shuf2_AVX512: dw 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23, 15, 14, 13, 12, 11, 10, 9, 8, 31, 30, 29, 28, 27, 26, 25, 24
dct32_shuf3_AVX512: times 2 dw 0, 8, 16, 24, 2, 10, 18, 26

; Byte-index patterns, 16 bytes repeated twice (32 bytes each).
; dct8_shuf lists byte pairs so that the four 16-bit words of each 8-byte half
; come out in reverse order; dct8_shuf_AVX512 swaps the two dwords of each
; 8-byte half.
; NOTE(review): presumably pshufb control masks -- confirm at the use sites.
dct8_shuf:         times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9
dct8_shuf_AVX512:  times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11

; tab_dct8: 8 rows x 8 signed 16-bit coefficients (16 bytes per row, 128 bytes).
; Row k holds the same values as the first 8 entries of row 2*k of tab_dct16.
; NOTE(review): presumably the 8-point forward DCT basis -- confirm at the use sites.
tab_dct8:       dw 64, 64, 64, 64, 64, 64, 64, 64
                dw 89, 75, 50, 18, -18, -50, -75, -89
                dw 83, 36, -36, -83, -83, -36, 36, 83
                dw 75, -18, -89, -50, 50, 89, 18, -75
                dw 64, -64, -64, 64, 64, -64, -64, 64
                dw 50, -89, 18, 75, -75, -18, 89, -50
                dw 36, -83, 83, -36, -36, 83, -83, 36
                dw 18, -50, 75, -89, 89, -75, 50, -18

; tab_dct8_avx512: 4 lines x 8 words (64 bytes total). Each line packs the first
; four coefficients of two consecutive tab_dct8 rows: rows 0|1, 2|3, 4|5, 6|7.
tab_dct8_avx512: dw 64, 64, 64, 64, 89, 75, 50, 18
                 dw 83, 36, -36, -83, 75, -18, -89, -50
                 dw 64, -64, -64, 64, 50, -89, 18, 75
                 dw 36, -83, 83, -36, 18, -50, 75, -89

; tab_dct16_1: 16 rows x 8 words. Row k holds the first 8 coefficients
; (columns 0-7) of row k of tab_dct16.
tab_dct16_1:    dw 64, 64, 64, 64, 64, 64, 64, 64
                dw 90, 87, 80, 70, 57, 43, 25,  9
                dw 89, 75, 50, 18, -18, -50, -75, -89
                dw 87, 57,  9, -43, -80, -90, -70, -25
                dw 83, 36, -36, -83, -83, -36, 36, 83
                dw 80,  9, -70, -87, -25, 57, 90, 43
                dw 75, -18, -89, -50, 50, 89, 18, -75
                dw 70, -43, -87,  9, 90, 25, -80, -57
                dw 64, -64, -64, 64, 64, -64, -64, 64
                dw 57, -80, -25, 90, -9, -87, 43, 70
                dw 50, -89, 18, 75, -75, -18, 89, -50
                dw 43, -90, 57, 25, -87, 70,  9, -80
                dw 36, -83, 83, -36, -36, 83, -83, 36
                dw 25, -70, 90, -80, 43,  9, -57, 87
                dw 18, -50, 75, -89, 89, -75, 50, -18
                dw  9, -25, 43, -57, 70, -80, 87, -90

; tab_dct16_2: 16 rows x 8 words. Row k holds the last 8 coefficients
; (columns 8-15, in their original order) of row k of tab_dct16.
tab_dct16_2:    dw 64, 64, 64, 64, 64, 64, 64, 64
                dw -9, -25, -43, -57, -70, -80, -87, -90
                dw -89, -75, -50, -18, 18, 50, 75, 89
                dw 25, 70, 90, 80, 43, -9, -57, -87
                dw 83, 36, -36, -83, -83, -36, 36, 83
                dw -43, -90, -57, 25, 87, 70, -9, -80
                dw -75, 18, 89, 50, -50, -89, -18, 75
                dw 57, 80, -25, -90, -9, 87, 43, -70
                dw 64, -64, -64, 64, 64, -64, -64, 64
                dw -70, -43, 87,  9, -90, 25, 80, -57
                dw -50, 89, -18, -75, 75, 18, -89, 50
                dw 80, -9, -70, 87, -25, -57, 90, -43
                dw 36, -83, 83, -36, -36, 83, -83, 36
                dw -87, 57, -9, -43, 80, -90, 70, -25
                dw -18, 50, -75, 89, -89, 75, -50, 18
                dw 90, -87, 80, -70, 57, -43, 25, -9

; Byte-index patterns, 16 bytes repeated twice (32 bytes each).
; dct16_shuf1 lists the eight 16-bit words of a 16-byte group in reverse order.
; dct16_shuf2 pairs word i with word 7-i: words 0,7,1,6,2,5,3,4.
; NOTE(review): presumably pshufb control masks -- confirm at the use sites.
dct16_shuf1:     times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1

dct16_shuf2:    times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9

; tab_dct32_1: 32 rows x 16 words. Row k holds the first 16 coefficients
; (columns 0-15) of row k of tab_dct32.
tab_dct32_1:    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
                dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13,  4
                dw 90, 87, 80, 70, 57, 43, 25,  9, -9, -25, -43, -57, -70, -80, -87, -90
                dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
                dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
                dw 87, 57,  9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87
                dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
                dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67,  4, 73, 88, 38
                dw 80,  9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80
                dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
                dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
                dw 70, -43, -87,  9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70
                dw 67, -54, -78, 38, 85, -22, -90,  4, 90, 13, -88, -31, 82, 46, -73, -61
                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
                dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
                dw 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87,  9, -90, 25, 80, -57
                dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
                dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82,  4, 78
                dw 43, -90, 57, 25, -87, 70,  9, -80, 80, -9, -70, 87, -25, -57, 90, -43
                dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
                dw 31, -78, 90, -61,  4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
                dw 25, -70, 90, -80, 43,  9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25
                dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
                dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31,  4, 22, -46, 67, -82, 90
                dw 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9
                dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90

; tab_dct32_2: 32 rows x 16 words. Row k holds the last 16 coefficients
; (columns 16-31, in their original order) of row k of tab_dct32.
tab_dct32_2:    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
                dw -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90
                dw -90, -87, -80, -70, -57, -43, -25, -9,  9, 25, 43, 57, 70, 80, 87, 90
                dw 13, 38, 61, 78, 88, 90, 85, 73, 54, 31,  4, -22, -46, -67, -82, -90
                dw 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89
                dw -22, -61, -85, -90, -73, -38,  4, 46, 78, 90, 82, 54, 13, -31, -67, -88
                dw -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43,  9, 57, 87
                dw 31, 78, 90, 61,  4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85
                dw 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83
                dw -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82
                dw -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70,  9, 80
                dw 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82,  4, -78
                dw 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75
                dw -54, -85,  4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73
                dw -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90,  9, -87, -43, 70
                dw 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67
                dw 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64
                dw -67, -54, 78, 38, -85, -22, 90,  4, -90, 13, 88, -31, -82, 46, 73, -61
                dw -57, 80, 25, -90,  9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57
                dw 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88,  4, 85, -54
                dw 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50
                dw -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46
                dw -43, 90, -57, -25, 87, -70, -9, 80, -80,  9, 70, -87, 25, 57, -90, 43
                dw 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67,  4, -73, 88, -38
                dw 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36
                dw -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31
                dw -25, 70, -90, 80, -43, -9, 57, -87, 87, -57,  9, 43, -80, 90, -70, 25
                dw 88, -67, 31, 13, -54, 82, -90, 78, -46,  4, 38, -73, 90, -85, 61, -22
                dw 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18
                dw -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13
                dw -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25,  9
                dw 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4

; avx2_idct8_1 / avx2_idct8_2: four rows each; every row is a 4-word coefficient
; group repeated 4 times (16 words = 32 bytes per row).
; NOTE(review): presumably the even-part (64/83/36) and odd-part (89/75/50/18)
; multipliers of the 8-point inverse DCT -- confirm at the use sites.
avx2_idct8_1:   times 4 dw 64, 83, 64, 36
                times 4 dw 64, 36, -64, -83
                times 4 dw 64, -36, -64, 83
                times 4 dw 64, -83, 64, -36

avx2_idct8_2:   times 4 dw 89, 75, 50, 18
                times 4 dw 75, -18, -89, -50
                times 4 dw 50, -89, 18, 75
                times 4 dw 18, -50, 75, -89

; avx512_idct8_1 / avx512_idct8_2: same 4-word coefficient groups as
; avx2_idct8_1 / avx2_idct8_2, repeated 8 times per row (32 words = 64 bytes).
avx512_idct8_1:   times 8 dw 64, 83, 64, 36
                  times 8 dw 64, 36, -64, -83
                  times 8 dw 64, -36, -64, 83
                  times 8 dw 64, -83, 64, -36

avx512_idct8_2:   times 8 dw 89, 75, 50, 18
                  times 8 dw 75, -18, -89, -50
                  times 8 dw 50, -89, 18, 75
                  times 8 dw 18, -50, 75, -89

; avx512_idct8_3: 8 rows of 32 words (64 bytes per row, 512 bytes total).
; Every row consists of two 32-byte halves, and each half is a single word pair
; repeated 8 times, so each row is written as two `times 8 dw` lines.
; Rows 0-3 use the 64/83/36 coefficients, rows 4-7 the 89/75/50/18 coefficients.
avx512_idct8_3:   times 8 dw  64,  83           ; row 0, low half
                  times 8 dw  64, -36           ; row 0, high half
                  times 8 dw  64,  36           ; row 1, low half
                  times 8 dw -64,  83           ; row 1, high half
                  times 8 dw  64,  36           ; row 2, low half
                  times 8 dw  64, -83           ; row 2, high half
                  times 8 dw -64, -83           ; row 3, low half
                  times 8 dw  64, -36           ; row 3, high half
                  times 8 dw  89,  75           ; row 4, low half
                  times 8 dw  50, -89           ; row 4, high half
                  times 8 dw  50,  18           ; row 5, low half
                  times 8 dw  18,  75           ; row 5, high half
                  times 8 dw  75, -18           ; row 6, low half
                  times 8 dw  18, -50           ; row 6, high half
                  times 8 dw -89, -50           ; row 7, low half
                  times 8 dw  75, -89           ; row 7, high half

; idct8_shuf1: 8 dword indices (even indices first, then odd).
; idct8_shuf2: declared through the `const` macro rather than a plain label
;   (NOTE(review): presumably so the symbol is exported to other files -- confirm
;   against the macro definition in x86inc.asm). Byte pattern swaps the two
;   middle dwords of each 16-byte group.
; idct8_shuf3 / idct8_avx512_shuf3: byte pattern listing the four dwords of a
;   16-byte group in reverse order, repeated 2x (32 bytes) and 4x (64 bytes).
; NOTE(review): presumably permute / pshufb controls -- confirm at the use sites.
idct8_shuf1:    dd 0, 2, 4, 6, 1, 3, 5, 7

const idct8_shuf2,    times 2 db 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15

idct8_shuf3:    times 2 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3


idct8_avx512_shuf3:    times 4 db 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3

; tab_idct16_1: 8 rows x 8 words. Row k holds the first 8 coefficients of the
; odd row 2*k+1 of tab_dct16.
tab_idct16_1:   dw 90, 87, 80, 70, 57, 43, 25, 9
                dw 87, 57, 9, -43, -80, -90, -70, -25
                dw 80, 9, -70, -87, -25, 57, 90, 43
                dw 70, -43, -87, 9, 90, 25, -80, -57
                dw 57, -80, -25, 90, -9, -87, 43, 70
                dw 43, -90, 57, 25, -87, 70, 9, -80
                dw 25, -70, 90, -80, 43, 9, -57, 87
                dw 9, -25, 43, -57, 70, -80, 87, -90

; tab_idct16_2: 8 rows x 8 words. Row j holds column j of the even rows
; (0, 2, ..., 14) of tab_dct16, i.e. the even-row sub-matrix transposed.
tab_idct16_2:   dw 64, 89, 83, 75, 64, 50, 36, 18
                dw 64, 75, 36, -18, -64, -89, -83, -50
                dw 64, 50, -36, -89, -64, 18, 83, 75
                dw 64, 18, -83, -50, 64, 75, -36, -89
                dw 64, -18, -83, 50, 64, -75, -36, 89
                dw 64, -50, -36, 89, -64, -18, 83, -75
                dw 64, -75, 36, 18, -64, 89, -83, 50
                dw 64, -89, 83, -75, 64, -50, 36, -18

; Index tables for the idct16 code paths.
; idct16_shuff / idct16_shuff1: 8 dword indices in 0..7.
; idct16_shuff2 / idct16_shuff3: 16 word indices interleaving i with i+16
;   (even i in shuff2, odd i in shuff3).
; idct16_shuff4 / idct16_shuff5: 8 dword indices interleaving i with i+8
;   (even i in shuff4, odd i in shuff5).
; NOTE(review): presumably permute controls -- confirm at the use sites.
idct16_shuff:   dd 0, 4, 2, 6, 1, 5, 3, 7

idct16_shuff1:  dd 2, 6, 0, 4, 3, 7, 1, 5
idct16_shuff2:  dw 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
idct16_shuff3:  dw 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31
idct16_shuff4:  dd 0, 8, 2, 10, 4, 12, 6, 14
idct16_shuff5:  dd 1, 9, 3, 11, 5, 13, 7, 15


; tab_AVX512_idct16_1: 4 rows of 32 words (64 bytes per row).
; Each row is two 8-word coefficient groups, each emitted twice back to back.
; The groups are the rows of tab_idct16_1, in the order 0,2 / 1,3 / 4,6 / 5,7.
tab_AVX512_idct16_1:   times 2 dw 90, 87, 80, 70, 57, 43, 25, 9
                       times 2 dw 80, 9, -70, -87, -25, 57, 90, 43
                       times 2 dw 87, 57, 9, -43, -80, -90, -70, -25
                       times 2 dw 70, -43, -87, 9, 90, 25, -80, -57
                       times 2 dw 57, -80, -25, 90, -9, -87, 43, 70
                       times 2 dw 25, -70, 90, -80, 43, 9, -57, 87
                       times 2 dw 43, -90, 57, 25, -87, 70, 9, -80
                       times 2 dw 9, -25, 43, -57, 70, -80, 87, -90

; tab_AVX512_idct16_2: 4 rows of 32 words (64 bytes per row).
; Each row is two 8-word coefficient groups, each emitted twice back to back.
; The groups are the rows of tab_idct16_2, in the order 0,2 / 1,3 / 4,6 / 5,7.
tab_AVX512_idct16_2:   times 2 dw 64, 89, 83, 75, 64, 50, 36, 18
                       times 2 dw 64, 50, -36, -89, -64, 18, 83, 75
                       times 2 dw 64, 75, 36, -18, -64, -89, -83, -50
                       times 2 dw 64, 18, -83, -50, 64, 75, -36, -89
                       times 2 dw 64, -18, -83, 50, 64, -75, -36, 89
                       times 2 dw 64, -75, 36, 18, -64, 89, -83, 50
                       times 2 dw 64, -50, -36, 89, -64, -18, 83, -75
                       times 2 dw 64, -89, 83, -75, 64, -50, 36, -18

; Index tables for the AVX-512 idct16 path.
; idct16_AVX512_shuff / shuff1: 16 dword indices; the upper 8 entries repeat the
;   lower 8 with 8 added (same values as idct16_shuff / idct16_shuff1 in the low half).
; idct16_AVX512_shuff2..5: 8 qword indices in 0..15.
; idct16_AVX512_shuff6: byte pattern listing the eight words of a 16-byte group
;   in reverse order, repeated 4 times (64 bytes).
; NOTE(review): presumably permute / pshufb controls -- confirm at the use sites.
idct16_AVX512_shuff:   dd 0, 4, 2, 6, 1, 5, 3, 7, 8, 12, 10, 14, 9, 13, 11, 15

idct16_AVX512_shuff1:  dd 2, 6, 0, 4, 3, 7, 1, 5, 10, 14, 8, 12, 11, 15, 9, 13

idct16_AVX512_shuff2:   dq 0, 1, 8, 9, 4, 5, 12, 13
idct16_AVX512_shuff3:   dq 2, 3, 10, 11, 6, 7, 14, 15
idct16_AVX512_shuff4:   dq 4, 5, 12, 13, 0, 1, 8, 9
idct16_AVX512_shuff5:   dq 6, 7, 14, 15, 2, 3, 10, 11
idct16_AVX512_shuff6:   times 4 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1

; tab_idct32_1: 16 rows x 16 words. Row k holds the first 16 coefficients of the
; odd row 2*k+1 of tab_dct32.
tab_idct32_1:   dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
                dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
                dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
                dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
                dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
                dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
                dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
                dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
                dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
                dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
                dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
                dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
                dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
                dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
                dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
                dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90


; tab_idct32_2: 8 rows x 8 words. The values are identical to tab_idct16_2;
; it is kept as a separate symbol.
tab_idct32_2:   dw 64, 89, 83, 75, 64, 50, 36, 18
                dw 64, 75, 36, -18, -64, -89, -83, -50
                dw 64, 50, -36, -89, -64, 18, 83, 75
                dw 64, 18, -83, -50, 64, 75, -36, -89
                dw 64, -18, -83, 50, 64, -75, -36, 89
                dw 64, -50, -36, 89, -64, -18, 83, -75
                dw 64, -75, 36, 18, -64, 89, -83, 50
                dw 64, -89, 83, -75, 64, -50, 36, -18


; tab_idct32_3: 8 rows x 8 words. The values are identical to tab_idct16_1;
; it is kept as a separate symbol.
tab_idct32_3:   dw 90, 87, 80, 70, 57, 43, 25, 9
                dw 87, 57, 9, -43, -80, -90, -70, -25
                dw 80, 9, -70, -87, -25, 57, 90, 43
                dw 70, -43, -87, 9, 90, 25, -80, -57
                dw 57, -80, -25, 90, -9, -87, 43, 70
                dw 43, -90, 57, 25, -87, 70, 9, -80
                dw 25, -70, 90, -80, 43, 9, -57, 87
                dw 9, -25, 43, -57, 70, -80, 87, -90

; tab_idct32_4: 16 rows x 16 words. Row j holds column j of the even rows
; (0, 2, ..., 30) of tab_dct32, i.e. the even-row sub-matrix transposed.
tab_idct32_4:   dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9
                dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25
                dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43
                dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57
                dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70
                dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80
                dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87
                dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90
                dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90
                dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87
                dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80
                dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70
                dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57
                dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43
                dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25
                dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9


; tab_idct32_AVX512_1: 8 rows of 32 words (64 bytes per row).
; Each row is two 8-word coefficient groups, each emitted twice back to back.
; Row pairs (0,1), (2,3), (4,5), (6,7) carry the low and high 8 coefficients of
; tab_idct32_1 rows 0|1, 2|3, 4|5 and 6|7 respectively.
tab_idct32_AVX512_1:   times 2 dw 90, 90, 88, 85, 82, 78, 73, 67
                       times 2 dw 90, 82, 67, 46, 22, -4, -31, -54
                       times 2 dw 61, 54, 46, 38, 31, 22, 13, 4
                       times 2 dw -73, -85, -90, -88, -78, -61, -38, -13
                       times 2 dw 88, 67, 31, -13, -54, -82, -90, -78
                       times 2 dw 85, 46, -13, -67, -90, -73, -22, 38
                       times 2 dw -46, -4, 38, 73, 90, 85, 61, 22
                       times 2 dw 82, 88, 54, -4, -61, -90, -78, -31
                       times 2 dw 82, 22, -54, -90, -61, 13, 78, 85
                       times 2 dw 78, -4, -82, -73, 13, 85, 67, -22
                       times 2 dw 31, -46, -90, -67, 4, 73, 88, 38
                       times 2 dw -88, -61, 31, 90, 54, -38, -90, -46
                       times 2 dw 73, -31, -90, -22, 78, 67, -38, -90
                       times 2 dw 67, -54, -78, 38, 85, -22, -90, 4
                       times 2 dw -13, 82, 61, -46, -88, -4, 85, 54
                       times 2 dw 90, 13, -88, -31, 82, 46, -73, -61

; tab_idct32_AVX512_5: 8 rows of 32 words (64 bytes per row).
; Each row is two 8-word coefficient groups, each emitted twice back to back.
; Row pairs (0,1), (2,3), (4,5), (6,7) carry the low and high 8 coefficients of
; tab_idct32_1 rows 15|14, 13|12, 11|10 and 9|8 respectively.
tab_idct32_AVX512_5:   times 2 dw 4, -13, 22, -31, 38, -46, 54, -61
                       times 2 dw 13, -38, 61, -78, 88, -90, 85, -73
                       times 2 dw 67, -73, 78, -82, 85, -88, 90, -90
                       times 2 dw 54, -31, 4, 22, -46, 67, -82, 90
                       times 2 dw 22, -61, 85, -90, 73, -38, -4, 46
                       times 2 dw 31, -78, 90, -61, 4, 54, -88, 82
                       times 2 dw -78, 90, -82, 54, -13, -31, 67, -88
                       times 2 dw -38, -22, 73, -90, 67, -13, -46, 85
                       times 2 dw 38, -88, 73, -4, -67, 90, -46, -31
                       times 2 dw 46, -90, 38, 54, -90, 31, 61, -88
                       times 2 dw 85, -78, 13, 61, -90, 54, 22, -82
                       times 2 dw 22, 67, -85, 13, 73, -82, 4, 78
                       times 2 dw 54, -85, -4, 88, -46, -61, 82, 13
                       times 2 dw 61, -73, -46, 82, 31, -88, -13, 90
                       times 2 dw -90, 38, 67, -78, -22, 90, -31, -73
                       times 2 dw -4, -90, 22, 85, -38, -78, 54, 67


; tab_idct32_AVX512_2: 4 rows of 32 words (64 bytes per row).
; Each row packs two consecutive rows of tab_idct32_2, each emitted twice back
; to back (rows 0|1, 2|3, 4|5, 6|7).
tab_idct32_AVX512_2:   times 2 dw 64, 89, 83, 75, 64, 50, 36, 18
                       times 2 dw 64, 75, 36, -18, -64, -89, -83, -50
                       times 2 dw 64, 50, -36, -89, -64, 18, 83, 75
                       times 2 dw 64, 18, -83, -50, 64, 75, -36, -89
                       times 2 dw 64, -18, -83, 50, 64, -75, -36, 89
                       times 2 dw 64, -50, -36, 89, -64, -18, 83, -75
                       times 2 dw 64, -75, 36, 18, -64, 89, -83, 50
                       times 2 dw 64, -89, 83, -75, 64, -50, 36, -18

; tab_idct32_AVX512_3: 4 rows of 32 words (64 bytes per row).
; Each row packs two consecutive rows of tab_idct32_3, each emitted twice back
; to back (rows 0|1, 2|3, 4|5, 6|7).
tab_idct32_AVX512_3:   times 2 dw 90, 87, 80, 70, 57, 43, 25, 9
                       times 2 dw 87, 57, 9, -43, -80, -90, -70, -25
                       times 2 dw 80, 9, -70, -87, -25, 57, 90, 43
                       times 2 dw 70, -43, -87, 9, 90, 25, -80, -57
                       times 2 dw 57, -80, -25, 90, -9, -87, 43, 70
                       times 2 dw 43, -90, 57, 25, -87, 70, 9, -80
                       times 2 dw 25, -70, 90, -80, 43, 9, -57, 87
                       times 2 dw 9, -25, 43, -57, 70, -80, 87, -90

; tab_idct32_AVX512_4: 16 rows of 32 words (64 bytes per row).
; Row k is row k of tab_idct32_1 (16 words) emitted twice back to back.
tab_idct32_AVX512_4:   times 2 dw 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
                       times 2 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
                       times 2 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
                       times 2 dw 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
                       times 2 dw 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
                       times 2 dw 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
                       times 2 dw 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
                       times 2 dw 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
                       times 2 dw 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
                       times 2 dw 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
                       times 2 dw 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78
                       times 2 dw 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
                       times 2 dw 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
                       times 2 dw 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
                       times 2 dw 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
                       times 2 dw 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90

; Sixteen rows of 32 words (64 bytes each).  Every row is one 16-coefficient
; vector (always starting with 64) stored twice, so both 32-byte halves of a
; row are identical.
; NOTE(review): consumer is outside this chunk - confirm before changing the layout.
tab_idct32_AVX512_6:   dw 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9, 64, 90, 89, 87, 83, 80, 75, 70, 64, 57, 50, 43, 36, 25, 18, 9
                       dw 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25, 64, 87, 75, 57, 36, 9, -18, -43, -64, -80, -89, -90, -83, -70, -50, -25
                       dw 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43, 64, 80, 50, 9, -36, -70, -89, -87, -64, -25, 18, 57, 83, 90, 75, 43
                       dw 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57, 64, 70, 18, -43, -83, -87, -50, 9, 64, 90, 75, 25, -36, -80, -89, -57
                       dw 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70, 64, 57, -18, -80, -83, -25, 50, 90, 64, -9, -75, -87, -36, 43, 89, 70
                       dw 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80, 64, 43, -50, -90, -36, 57, 89, 25, -64, -87, -18, 70, 83, 9, -75, -80
                       dw 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87, 64, 25, -75, -70, 36, 90, 18, -80, -64, 43, 89, 9, -83, -57, 50, 87
                       dw 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90, 64, 9, -89, -25, 83, 43, -75, -57, 64, 70, -50, -80, 36, 87, -18, -90
                       dw 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90, 64, -9, -89, 25, 83, -43, -75, 57, 64, -70, -50, 80, 36, -87, -18, 90
                       dw 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87, 64, -25, -75, 70, 36, -90, 18, 80, -64, -43, 89, -9, -83, 57, 50, -87
                       dw 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80, 64, -43, -50, 90, -36, -57, 89, -25, -64, 87, -18, -70, 83, -9, -75, 80
                       dw 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70, 64, -57, -18, 80, -83, 25, 50, -90, 64, 9, -75, 87, -36, -43, 89, -70
                       dw 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57, 64, -70, 18, 43, -83, 87, -50, -9, 64, -90, 75, -25, -36, 80, -89, 57
                       dw 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43, 64, -80, 50, -9, -36, 70, -89, 87, -64, 25, 18, -57, 83, -90, 75, -43
                       dw 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25, 64, -87, 75, -57, 36, -9, -18, 43, -64, 80, -89, 90, -83, 70, -50, 25
                       dw 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9, 64, -90, 89, -87, 83, -80, 75, -70, 64, -57, 50, -43, 36, -25, 18, -9


; Coefficient pairs for the AVX2 4x4 forward DCT, two 32-byte rows.
; Each 16-byte lane repeats one (c0, c1) pair four times for pmaddwd:
;   row 0 = sums:        low lane ( 64,  64)   high lane ( 64, -64)
;   row 1 = differences: low lane ( 83,  36)   high lane ( 36, -83)
avx2_dct4:      times 4 dw 64,  64
                times 4 dw 64, -64
                times 4 dw 83,  36
                times 4 dw 36, -83

; Two 32-byte rows of pmaddwd coefficient pairs; each 16-byte lane repeats
; one (c0, c1) pair four times.  The contents match avx2_dct4 word for word.
; NOTE(review): consumer is outside this chunk.
avx2_idct4_1:   times 4 dw 64,  64
                times 4 dw 64, -64
                times 4 dw 83,  36
                times 4 dw 36, -83

; Eight words (16 bytes): the four 4-point coefficient pairs
; (64, 64) (64, -64) (83, 36) (36, -83), each stored once.
; NOTE(review): consumer is outside this chunk.
avx2_idct4_2:   dw 64,  64, 64, -64
                dw 83,  36, 36, -83

; pshufb mask, one 16-byte pattern stored twice (32 bytes).  Within each
; 16 bytes it reorders the eight words to [w0 w2 w1 w3 w4 w6 w5 w7], i.e. it
; swaps the two middle words of each 4-word half.
; NOTE(review): declared through x86inc's `const` macro rather than a plain
; label - presumably so the symbol is visible to other files; confirm in x86inc.asm.
const idct4_shuf1,    times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15

; pshufb mask, identical in both 16-byte lanes: swaps the two dwords inside
; each qword, giving [d1 d0 d3 d2].
; NOTE(review): consumer is outside this chunk.
idct4_shuf2:    db  4,  5,  6,  7,  0,  1,  2,  3, 12, 13, 14, 15,  8,  9, 10, 11
                db  4,  5,  6,  7,  0,  1,  2,  3, 12, 13, 14, 15,  8,  9, 10, 11

; 4-point DCT coefficient pairs for pmaddwd, one 16-byte row per pair (the
; pair is repeated four times so it applies to every dword of an XMM register).
; Indexed as [tab_dct4 + row * 16] by the SSE2 dct4 and idct4 routines and by
; the even part of the SSE2 dct8 first pass.
tab_dct4:       times 4 dw 64, 64           ; row 0
                times 4 dw 83, 36           ; row 1
                times 4 dw 64, -64          ; row 2
                times 4 dw 36, -83          ; row 3

; pshufb mask for the AVX2 dct4 (broadcast to both lanes there).  It reorders
; the eight words of a lane to [w0 w1 w4 w5 w3 w2 w7 w6]: with two 4-sample
; rows a, b in a lane this yields [a0 a1 b0 b1 | a3 a2 b3 b2], so adding or
; subtracting the two qwords forms the butterfly terms a0+-a3 and a1+-a2.
dct4_shuf:      db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13

; 4-point DST basis, one 16-byte row per output coefficient.  Each row holds
; its four coefficients twice so a single pmaddwd covers two 4-sample rows.
; Indexed as [tab_dst4 + row * 16] by the SSE2 and SSSE3 dst4 routines.
tab_dst4:       times 2 dw 29,  55,  74,  84
                times 2 dw 74,  74,   0, -74
                times 2 dw 84, -29, -74,  55
                times 2 dw 55, -84,  74, -29

; Same four DST basis rows as tab_dst4, widened to 32 bytes per row (each
; 4-coefficient row repeated four times) for the YMM code path.
; Indexed as [pw_dst4_tab + row * 32] by the AVX2 dst4 routine.
pw_dst4_tab:    times 4 dw 29,  55,  74,  84
                times 4 dw 74,  74,   0, -74
                times 4 dw 84, -29, -74,  55
                times 4 dw 55, -84,  74, -29

; Inverse 4-point DST coefficient pairs for pmaddwd, one 16-byte row per pair
; (pair repeated four times).  The SSE2 idst4 routine consumes the rows two at
; a time: rows 2k and 2k+1 are multiplied with the two interleaved input
; halves and the products are summed to form one output vector.
tab_idst4:      times 4 dw  29,  84
                times 4 dw  74,  55
                times 4 dw  55, -29
                times 4 dw  74, -84
                times 4 dw  74, -74
                times 4 dw   0,  74
                times 4 dw  84,  55
                times 4 dw -74, -29

; Inverse 4-point DST coefficient pairs for the AVX2 idst4 routine, which
; reads them as four 32-byte rows ([pw_idst4_tab + row * 32]).  Each source
; line below fills one 16-byte lane, so a 32-byte row carries a different
; coefficient pair in its low and high lane.
; Note the pair order differs from tab_idst4 (the SSE2 table).
pw_idst4_tab:   times 4 dw  29,  84         ; row 0, low lane
                times 4 dw  55, -29         ; row 0, high lane
                times 4 dw  74,  55         ; row 1, low lane
                times 4 dw  74, -84         ; row 1, high lane
                times 4 dw  74, -74         ; row 2, low lane
                times 4 dw  84,  55         ; row 2, high lane
                times 4 dw  0,   74         ; row 3, low lane
                times 4 dw -74, -29         ; row 3, high lane
; pshufb mask (same pattern in both 16-byte lanes) used by the AVX2 idst4
; routine: interleaves the low four words of a lane with its high four words,
; giving [w0 w4 w1 w5 w2 w6 w3 w7].
pb_idst4_shuf:  times 2 db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15

; Odd-part coefficients for the first pass of the SSE2 dct8 routine, one
; 16-byte row per odd output row (rows 1, 3, 5, 7 are produced from table
; rows 0..3 respectively).  Each row stores its four coefficients twice.
tab_dct8_1:     times 2 dw 89, 50, 75, 18
                times 2 dw 75, -89, -18, -50
                times 2 dw 50, 18, -89, 75
                times 2 dw 18, 75, -50, -89

; Dword coefficients for the second pass of the SSE2 dct8 routine, six
; 16-byte rows addressed as [tab_dct8_2 + row * 16]:
;   rows 0-1  even part, (83, 36) and (36, 83), each pair stored twice
;   rows 2-5  odd part, four coefficients per row
tab_dct8_2:     times 2 dd 83, 36
                times 2 dd 36, 83
                dd 89,  75,  50,  18
                dd 75, -18, -89, -50
                dd 50, -89,  18,  75
                dd 18, -50,  75, -89

; Eight 16-byte rows of pmaddwd coefficient pairs built from the 8-point odd
; coefficients {89, 75, 50, 18}; every row repeats one pair four times.
; NOTE(review): consumer is outside this chunk - the name suggests an 8x8
; inverse DCT; confirm before changing the row order.
tab_idct8_3:    times 4 dw 89, 75
                times 4 dw 50, 18
                times 4 dw 75, -18
                times 4 dw -89, -50
                times 4 dw 50, -89
                times 4 dw 18, 75
                times 4 dw 18, -50
                times 4 dw 75, -89

; pshufb mask: interleaves the low four words of a register with its high
; four words, giving [w0 w4 w1 w5 w2 w6 w3 w7].
; NOTE(review): consumer is outside this chunk.
pb_unpackhlw1:  db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15

; pshufb mask: gathers the even-indexed words as [w0 w4 w2 w6] and stores that
; group in both halves of the result.
; NOTE(review): consumer is outside this chunk.
pb_idct8even:   db 0, 1, 8, 9, 4, 5, 12, 13, 0, 1, 8, 9, 4, 5, 12, 13

; Eight words (16 bytes): the coefficient pairs (64, -64) (36, -83) (64, 64)
; (83, 36), each stored once.
; NOTE(review): consumer is outside this chunk.
tab_idct8_1:    dw 64, -64, 36, -83, 64, 64, 83, 36

; Two 16-byte rows, each holding two 4-coefficient groups built from the
; 8-point odd coefficients {89, 75, 50, 18}.
; NOTE(review): consumer is outside this chunk.
tab_idct8_2:    dw 89,  75,  50,  18, 75, -18, -89, -50
                dw 50, -89,  18,  75, 18, -50,  75, -89
; pshufb mask: gathers the odd-indexed words [w1 w3 w5 w7] and stores that
; group in both halves of the result.
; NOTE(review): consumer is outside this chunk.
pb_idct8odd:    db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15

; Scale-bit tables for rdoQuant: four qword entries per table.
; NOTE(review): the 8/10/12 suffix presumably selects the bit depth and the
; four entries presumably map to the four transform sizes - confirm against
; the rdoQuant code, which is outside this chunk.
tab_nonpsyRdo8:  dq  5,  7,  9, 11
tab_nonpsyRdo10: dq  9, 11, 13, 15
tab_nonpsyRdo12: dq 13, 15, 17, 19

SECTION .text
cextern pd_1
cextern pd_2
cextern pd_4
cextern pd_8
cextern pd_16
cextern pd_32
cextern pd_64
cextern pd_128
cextern pd_256
cextern pd_512
cextern pd_1024
cextern pd_2048
cextern pw_ppppmmmm
cextern trans8_shuf


; Per-bit-depth shift / rounding constants.
;
; Every *_ROUND value equals 1 << (*_SHIFT - 1).  The ROUND values must stay
; plain decimal literals: the code pastes them into a symbol name
; (pd_ %+ DCT4_ROUND -> pd_16, ...) to pick the matching rounding vector.
;
; NOTE(review): the RDO_MAX_<size> values are consumed outside this chunk;
; their exact meaning should be confirmed there.
%if BIT_DEPTH == 12
    %define DCT4_SHIFT      5
    %define DCT4_ROUND      16
    %define DST4_SHIFT      5
    %define DST4_ROUND      16
    %define DCT8_SHIFT1     6
    %define DCT8_ROUND1     32
    %define IDCT_SHIFT      8
    %define IDCT_ROUND      128
    %define RDO_MAX_4       3
    %define RDO_MAX_8       1
    %define RDO_MAX_16      0
    %define RDO_MAX_32      0
%elif BIT_DEPTH == 10
    %define DCT4_SHIFT      3
    %define DCT4_ROUND      4
    %define DST4_SHIFT      3
    %define DST4_ROUND      4
    %define DCT8_SHIFT1     4
    %define DCT8_ROUND1     8
    %define IDCT_SHIFT      10
    %define IDCT_ROUND      512
    %define RDO_MAX_4       7
    %define RDO_MAX_8       5
    %define RDO_MAX_16      3
    %define RDO_MAX_32      1
%elif BIT_DEPTH == 8
    %define DCT4_SHIFT      1
    %define DCT4_ROUND      1
    %define DST4_SHIFT      1
    %define DST4_ROUND      1
    %define DCT8_SHIFT1     2
    %define DCT8_ROUND1     2
    %define IDCT_SHIFT      12
    %define IDCT_ROUND      2048
    %define RDO_MAX_4       11
    %define RDO_MAX_8       9
    %define RDO_MAX_16      7
    %define RDO_MAX_32      5
%else
    %error Unsupported BIT_DEPTH!
%endif

; Second-pass constants of the 8x8 forward DCT; these do not depend on BIT_DEPTH.
%define DCT8_SHIFT2         9
%define DCT8_ROUND2         256

;------------------------------------------------------
;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
; 4x4 forward DCT, SSE2 version.
;   r0 = src        four rows of four int16_t samples
;   r1 = dst        16 int16_t coefficients, written with two unaligned 16-byte stores
;   r2 = srcStride  row pitch in int16_t units (doubled below to a byte pitch)
;   r3 = scratch    -> tab_dct4 (rows of 16 bytes)
; Pass 1 rounds with pd_<DCT4_ROUND> and shifts by DCT4_SHIFT (bit-depth
; dependent); pass 2 rounds with pd_128 and shifts by 8.
INIT_XMM sse2
cglobal dct4, 3, 4, 8
    mova        m7, [pd_ %+ DCT4_ROUND]
    add         r2, r2                      ; byte pitch; full-width add so a negative intptr_t stride is not zero-extended
    lea         r3, [tab_dct4]

    mova        m4, [r3 + 0 * 16]           ; ( 64,  64)
    mova        m5, [r3 + 1 * 16]           ; ( 83,  36)
    mova        m6, [r3 + 2 * 16]           ; ( 64, -64)

    ; rows a, b -> m0 = [a0 a1 b0 b1 | a3 a2 b3 b2]
    movh        m0, [r0 + 0 * r2]
    movh        m1, [r0 + 1 * r2]
    punpcklqdq  m0, m1
    pshufd      m0, m0, 0xD8
    pshufhw     m0, m0, 0xB1

    ; rows c, d -> m1 = [c0 c1 d0 d1 | c3 c2 d3 d2]
    lea         r0, [r0 + 2 * r2]
    movh        m1, [r0]
    movh        m2, [r0 + r2]
    punpcklqdq  m1, m2
    pshufd      m1, m1, 0xD8
    pshufhw     m1, m1, 0xB1

    punpcklqdq  m2, m0, m1                  ; m2 = [x0 x1] of every row
    punpckhqdq  m0, m1                      ; m0 = [x3 x2] of every row

    ; pass 1: butterfly, then one pmaddwd per output coefficient
    paddw       m1, m2, m0                  ; m1 = sums        (x0+x3, x1+x2)
    psubw       m2, m0                      ; m2 = differences (x0-x3, x1-x2)
    pmaddwd     m0, m1, m4                  ; coefficient 0
    paddd       m0, m7
    psrad       m0, DCT4_SHIFT
    pmaddwd     m3, m2, m5                  ; coefficient 1
    paddd       m3, m7
    psrad       m3, DCT4_SHIFT
    packssdw    m0, m3
    pshufd      m0, m0, 0xD8                ; same reordering as on the input rows
    pshufhw     m0, m0, 0xB1
    pmaddwd     m1, m6                      ; coefficient 2
    paddd       m1, m7
    psrad       m1, DCT4_SHIFT
    pmaddwd     m2, [r3 + 3 * 16]           ; coefficient 3, ( 36, -83)
    paddd       m2, m7
    psrad       m2, DCT4_SHIFT
    packssdw    m1, m2
    pshufd      m1, m1, 0xD8
    pshufhw     m1, m1, 0xB1

    punpcklqdq  m2, m0, m1                  ; m2 = pass-1 results of rows a, b (interleaved)
    punpckhqdq  m0, m1                      ; m0 = pass-1 results of rows d, c (interleaved)

    mova        m7, [pd_128]                ; pass-2 rounding

    ; pass 2, output row 0: 64 * (a + b + c + d)
    pmaddwd     m1, m2, m4
    pmaddwd     m3, m0, m4
    paddd       m1, m3
    paddd       m1, m7
    psrad       m1, 8

    ; output row 1: 83 * (a - d) + 36 * (b - c)
    pmaddwd     m4, m2, m5
    pmaddwd     m3, m0, m5
    psubd       m4, m3
    paddd       m4, m7
    psrad       m4, 8
    packssdw    m1, m4
    movu        [r1 + 0 * 16], m1

    ; output row 2: 64 * (a - b - c + d)
    pmaddwd     m1, m2, m6
    pmaddwd     m3, m0, m6
    paddd       m1, m3
    paddd       m1, m7
    psrad       m1, 8

    ; output row 3: 36 * (a - d) - 83 * (b - c)
    pmaddwd     m2, [r3 + 3 * 16]
    pmaddwd     m0, [r3 + 3 * 16]
    psubd       m2, m0
    paddd       m2, m7
    psrad       m2, 8
    packssdw    m1, m2
    movu        [r1 + 1 * 16], m1
    RET

; DCT 4x4 (AVX2)
;
; void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;
; Input parameters:
; - r0:     source, four rows of four int16_t samples
; - r1:     destination, 16 int16_t coefficients (one unaligned 32-byte store)
; - r2:     source stride in int16_t units (doubled below to a byte pitch)
; Scratch:
; - r3:     -> avx2_dct4 (two 32-byte rows of coefficient pairs)
;
; Pass 1 rounds with pd_<DCT4_ROUND> and shifts by DCT4_SHIFT; pass 2 rounds
; with pd_128 and shifts by 8.
INIT_YMM avx2
cglobal dct4, 3, 4, 8, src, dst, srcStride
    vbroadcasti128  m7, [pd_ %+ DCT4_ROUND]
    add             r2, r2                  ; byte pitch; full-width add so a negative intptr_t stride is not zero-extended
    lea             r3, [avx2_dct4]

    vbroadcasti128  m4, [dct4_shuf]
    mova            m5, [r3]                ; low lane (64, 64), high lane (64, -64)
    mova            m6, [r3 + 32]           ; low lane (83, 36), high lane (36, -83)
    movq            xm0, [r0]
    movhps          xm0, [r0 + r2]
    lea             r0, [r0 + 2 * r2]
    movq            xm1, [r0]
    movhps          xm1, [r0 + r2]

    vinserti128     m0, m0, xm1, 1          ; m0 = [row0 row1 | row2 row3]
    pshufb          m0, m4                  ; per lane: [a0 a1 b0 b1 | a3 a2 b3 b2]
    vpermq          m1, m0, 11011101b       ; m1 = [x3 x2] of every row, in both lanes
    vpermq          m0, m0, 10001000b       ; m0 = [x0 x1] of every row, in both lanes
    paddw           m2, m0, m1              ; m2 = sums
    psubw           m0, m1                  ; m0 = differences

    ; pass 1: low lane yields coefficients 0 / 1, high lane coefficients 2 / 3
    pmaddwd         m2, m5
    paddd           m2, m7
    psrad           m2, DCT4_SHIFT

    pmaddwd         m0, m6
    paddd           m0, m7
    psrad           m0, DCT4_SHIFT

    packssdw        m2, m0
    pshufb          m2, m4                  ; same reordering again for the second pass
    vpermq          m1, m2, 11011101b
    vpermq          m2, m2, 10001000b
    vbroadcasti128  m7, [pd_128]            ; pass-2 rounding

    ; pass 2, sum terms -> output rows 0 (low lane) and 2 (high lane)
    pmaddwd         m0, m2, m5
    pmaddwd         m3, m1, m5
    paddd           m3, m0
    paddd           m3, m7
    psrad           m3, 8

    ; pass 2, difference terms -> output rows 1 (low lane) and 3 (high lane)
    pmaddwd         m2, m6
    pmaddwd         m1, m6
    psubd           m2, m1
    paddd           m2, m7
    psrad           m2, 8

    packssdw        m3, m2
    movu            [r1], m3
    RET

;-------------------------------------------------------
;void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
; 4x4 inverse DCT, SSE2 version.
;   r0 = src        16 contiguous int16_t coefficients (two unaligned 16-byte loads)
;   r1 = dst        four rows of four int16_t, 8 bytes stored per row
;   r2 = dstStride  row pitch in int16_t units (doubled below to a byte pitch)
;   r3 = scratch    -> tab_dct4 (rows of 16 bytes)
; Pass 1 rounds with pd_64 and shifts by 7; pass 2 rounds with
; pd_<IDCT_ROUND> and shifts by IDCT_SHIFT (bit-depth dependent).
INIT_XMM sse2
cglobal idct4, 3, 4, 6
    add         r2, r2                      ; byte pitch; full-width add so a negative intptr_t stride is not zero-extended
    lea         r3, [tab_dct4]

    movu        m0, [r0 + 0 * 16]
    movu        m1, [r0 + 1 * 16]

    ; ---- pass 1 ----
    punpcklwd   m2, m0, m1
    pmaddwd     m3, m2, [r3 + 0 * 16]       ; m3 = E1
    paddd       m3, [pd_64]

    pmaddwd     m2, [r3 + 2 * 16]           ; m2 = E2
    paddd       m2, [pd_64]

    punpckhwd   m0, m1
    pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
    pmaddwd     m0, [r3 + 3 * 16]           ; m0 = O2

    paddd       m4, m3, m1
    psrad       m4, 7                       ; m4 = m128iA
    paddd       m5, m2, m0
    psrad       m5, 7
    packssdw    m4, m5                      ; m4 = m128iA

    psubd       m2, m0
    psrad       m2, 7
    psubd       m3, m1
    psrad       m3, 7
    packssdw    m2, m3                      ; m2 = m128iD

    ; transpose the intermediate 4x4 block
    punpcklwd   m1, m4, m2                  ; m1 = S0
    punpckhwd   m4, m2                      ; m4 = S8

    punpcklwd   m0, m1, m4                  ; m0 = m128iA
    punpckhwd   m1, m4                      ; m1 = m128iD

    ; ---- pass 2 ----
    punpcklwd   m2, m0, m1
    pmaddwd     m3, m2, [r3 + 0 * 16]
    paddd       m3, [pd_ %+ IDCT_ROUND]     ; m3 = E1

    pmaddwd     m2, [r3 + 2 * 16]
    paddd       m2, [pd_ %+ IDCT_ROUND]     ; m2 = E2

    punpckhwd   m0, m1
    pmaddwd     m1, m0, [r3 + 1 * 16]       ; m1 = O1
    pmaddwd     m0, [r3 + 3 * 16]           ; m0 = O2

    paddd       m4, m3, m1
    psrad       m4, IDCT_SHIFT              ; m4 = m128iA
    paddd       m5, m2, m0
    psrad       m5, IDCT_SHIFT
    packssdw    m4, m5                      ; m4 = m128iA

    psubd       m2, m0
    psrad       m2, IDCT_SHIFT
    psubd       m3, m1
    psrad       m3, IDCT_SHIFT
    packssdw    m2, m3                      ; m2 = m128iD

    ; transpose back and store one 8-byte row at a time
    punpcklwd   m1, m4, m2
    punpckhwd   m4, m2

    punpcklwd   m0, m1, m4
    movlps      [r1 + 0 * r2], m0           ; row 0
    movhps      [r1 + 1 * r2], m0           ; row 1

    punpckhwd   m1, m4
    movlps      [r1 + 2 * r2], m1           ; row 2
    lea         r1, [r1 + 2 * r2]
    movhps      [r1 + r2], m1               ; row 3
    RET

;------------------------------------------------------
;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
; 4x4 forward DST, SSE2 version.
;   r0 = src        four rows of four int16_t samples
;   r1 = dst        16 int16_t coefficients, written with two unaligned 16-byte stores
;   r2 = srcStride  row pitch in int16_t units (doubled below to a byte pitch)
;   r3 = scratch    -> tab_dst4 (rows of 16 bytes)
; On x86-64 the four coefficient rows are kept in m8-m11; on x86-32 they are
; read from memory through r3 each time.
; SSE2 has no phaddd, so every horizontal pair-add is emulated:
;   pshufd q2301 swaps the two dwords of each pair, paddd leaves the pair sum
;   in both positions, pshufd q3120 + punpcklqdq then gather the sums of two
;   registers into one.
; Pass 1 rounds with pd_<DST4_ROUND> and shifts by DST4_SHIFT; pass 2 rounds
; with pd_128 and shifts by 8.
INIT_XMM sse2
%if ARCH_X86_64
cglobal dst4, 3, 4, 8+4
  %define       coef0   m8
  %define       coef1   m9
  %define       coef2   m10
  %define       coef3   m11
%else ; ARCH_X86_64 = 0
cglobal dst4, 3, 4, 8
  %define       coef0   [r3 + 0 * 16]
  %define       coef1   [r3 + 1 * 16]
  %define       coef2   [r3 + 2 * 16]
  %define       coef3   [r3 + 3 * 16]
%endif ; ARCH_X86_64

    mova        m5, [pd_ %+ DST4_ROUND]
    add         r2, r2                       ; byte pitch; full-width add so a negative intptr_t stride is not zero-extended
    lea         r3, [tab_dst4]
%if ARCH_X86_64
    mova        coef0, [r3 + 0 * 16]
    mova        coef1, [r3 + 1 * 16]
    mova        coef2, [r3 + 2 * 16]
    mova        coef3, [r3 + 3 * 16]
%endif
    movh        m0, [r0 + 0 * r2]            ; load
    movhps      m0, [r0 + 1 * r2]            ; m0 = rows 0, 1
    lea         r0, [r0 + 2 * r2]
    movh        m1, [r0]
    movhps      m1, [r0 + r2]                ; m1 = rows 2, 3
    pmaddwd     m2, m0, coef0                ; DST1
    pmaddwd     m3, m1, coef0
    pshufd      m6, m2, q2301
    pshufd      m7, m3, q2301
    paddd       m2, m6
    paddd       m3, m7
    pshufd      m2, m2, q3120
    pshufd      m3, m3, q3120
    punpcklqdq  m2, m3
    paddd       m2, m5
    psrad       m2, DST4_SHIFT
    pmaddwd     m3, m0, coef1
    pmaddwd     m4, m1, coef1
    pshufd      m6, m4, q2301
    pshufd      m7, m3, q2301
    paddd       m4, m6
    paddd       m3, m7
    pshufd      m4, m4, q3120
    pshufd      m3, m3, q3120
    punpcklqdq  m3, m4
    paddd       m3, m5
    psrad       m3, DST4_SHIFT
    packssdw    m2, m3                       ; m2 = T70
    pmaddwd     m3, m0, coef2
    pmaddwd     m4, m1, coef2
    pshufd      m6, m4, q2301
    pshufd      m7, m3, q2301
    paddd       m4, m6
    paddd       m3, m7
    pshufd      m4, m4, q3120
    pshufd      m3, m3, q3120
    punpcklqdq  m3, m4
    paddd       m3, m5
    psrad       m3, DST4_SHIFT
    pmaddwd     m0, coef3
    pmaddwd     m1, coef3
    pshufd      m6, m0, q2301
    pshufd      m7, m1, q2301
    paddd       m0, m6
    paddd       m1, m7
    pshufd      m0, m0, q3120
    pshufd      m1, m1, q3120
    punpcklqdq  m0, m1
    paddd       m0, m5
    psrad       m0, DST4_SHIFT
    packssdw    m3, m0                       ; m3 = T71
    mova        m5, [pd_128]                 ; pass-2 rounding

    pmaddwd     m0, m2, coef0                ; DST2
    pmaddwd     m1, m3, coef0
    pshufd      m6, m0, q2301
    pshufd      m7, m1, q2301
    paddd       m0, m6
    paddd       m1, m7
    pshufd      m0, m0, q3120
    pshufd      m1, m1, q3120
    punpcklqdq  m0, m1
    paddd       m0, m5
    psrad       m0, 8

    pmaddwd     m4, m2, coef1
    pmaddwd     m1, m3, coef1
    pshufd      m6, m4, q2301
    pshufd      m7, m1, q2301
    paddd       m4, m6
    paddd       m1, m7
    pshufd      m4, m4, q3120
    pshufd      m1, m1, q3120
    punpcklqdq  m4, m1
    paddd       m4, m5
    psrad       m4, 8
    packssdw    m0, m4
    movu        [r1 + 0 * 16], m0            ; first 8 output coefficients

    pmaddwd     m0, m2, coef2
    pmaddwd     m1, m3, coef2
    pshufd      m6, m0, q2301
    pshufd      m7, m1, q2301
    paddd       m0, m6
    paddd       m1, m7
    pshufd      m0, m0, q3120
    pshufd      m1, m1, q3120
    punpcklqdq  m0, m1
    paddd       m0, m5
    psrad       m0, 8

    pmaddwd     m2, coef3
    pmaddwd     m3, coef3
    pshufd      m6, m2, q2301
    pshufd      m7, m3, q2301
    paddd       m2, m6
    paddd       m3, m7
    pshufd      m2, m2, q3120
    pshufd      m3, m3, q3120
    punpcklqdq  m2, m3
    paddd       m2, m5
    psrad       m2, 8
    packssdw    m0, m2
    movu        [r1 + 1 * 16], m0            ; last 8 output coefficients
    RET

;------------------------------------------------------
;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
; 4x4 forward DST, SSSE3 version (horizontal sums done with phaddd).
;   r0 = src        four rows of four int16_t samples
;   r1 = dst        16 int16_t coefficients, written with two unaligned 16-byte stores
;   r2 = srcStride  row pitch in int16_t units (doubled below to a byte pitch)
;   r3 = scratch    -> tab_dst4 (rows of 16 bytes)
; Coefficient rows 0 and 1 always live in m6 / m7; rows 2 and 3 live in
; m8 / m9 on x86-64 and are read from memory through r3 on x86-32.
; Pass 1 rounds with pd_<DST4_ROUND> and shifts by DST4_SHIFT; pass 2 rounds
; with pd_128 and shifts by 8.
INIT_XMM ssse3
%if ARCH_X86_64
cglobal dst4, 3, 4, 8+2
  %define       coef2   m8
  %define       coef3   m9
%else ; ARCH_X86_64 = 0
cglobal dst4, 3, 4, 8
  %define       coef2   [r3 + 2 * 16]
  %define       coef3   [r3 + 3 * 16]
%endif ; ARCH_X86_64
%define         coef0   m6
%define         coef1   m7

    mova        m5, [pd_ %+ DST4_ROUND]
    add         r2, r2                       ; byte pitch; full-width add so a negative intptr_t stride is not zero-extended
    lea         r3, [tab_dst4]
    mova        coef0, [r3 + 0 * 16]
    mova        coef1, [r3 + 1 * 16]
%if ARCH_X86_64
    mova        coef2, [r3 + 2 * 16]
    mova        coef3, [r3 + 3 * 16]
%endif
    movh        m0, [r0 + 0 * r2]            ; load
    movh        m1, [r0 + 1 * r2]
    punpcklqdq  m0, m1                       ; m0 = rows 0, 1
    lea         r0, [r0 + 2 * r2]
    movh        m1, [r0]
    movh        m2, [r0 + r2]
    punpcklqdq  m1, m2                       ; m1 = rows 2, 3
    pmaddwd     m2, m0, coef0                ; DST1
    pmaddwd     m3, m1, coef0
    phaddd      m2, m3
    paddd       m2, m5
    psrad       m2, DST4_SHIFT
    pmaddwd     m3, m0, coef1
    pmaddwd     m4, m1, coef1
    phaddd      m3, m4
    paddd       m3, m5
    psrad       m3, DST4_SHIFT
    packssdw    m2, m3                       ; m2 = T70
    pmaddwd     m3, m0, coef2
    pmaddwd     m4, m1, coef2
    phaddd      m3, m4
    paddd       m3, m5
    psrad       m3, DST4_SHIFT
    pmaddwd     m0, coef3
    pmaddwd     m1, coef3
    phaddd      m0, m1
    paddd       m0, m5
    psrad       m0, DST4_SHIFT
    packssdw    m3, m0                       ; m3 = T71
    mova        m5, [pd_128]                 ; pass-2 rounding

    pmaddwd     m0, m2, coef0                ; DST2
    pmaddwd     m1, m3, coef0
    phaddd      m0, m1
    paddd       m0, m5
    psrad       m0, 8

    pmaddwd     m4, m2, coef1
    pmaddwd     m1, m3, coef1
    phaddd      m4, m1
    paddd       m4, m5
    psrad       m4, 8
    packssdw    m0, m4
    movu        [r1 + 0 * 16], m0            ; first 8 output coefficients

    pmaddwd     m0, m2, coef2
    pmaddwd     m1, m3, coef2
    phaddd      m0, m1
    paddd       m0, m5
    psrad       m0, 8

    pmaddwd     m2, coef3
    pmaddwd     m3, coef3
    phaddd      m2, m3
    paddd       m2, m5
    psrad       m2, 8
    packssdw    m0, m2
    movu        [r1 + 1 * 16], m0            ; last 8 output coefficients
    RET

;------------------------------------------------------------------
;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------------------
; 4x4 forward DST, AVX2 version: the whole block is held in one YMM register.
;   r0 = src        four rows of four int16_t samples
;   r1 = dst        16 int16_t coefficients (one unaligned 32-byte store)
;   r2 = srcStride  row pitch in int16_t units (doubled below to a byte pitch)
;   r3 = scratch    -> pw_dst4_tab (rows of 32 bytes)
; m4 holds the dword permutation trans8_shuf (defined in another file); it is
; applied after each pass to reorder the packed results.
; Pass 1 rounds with pd_<DST4_ROUND> and shifts by DST4_SHIFT; pass 2 rounds
; with pd_128 and shifts by 8.
INIT_YMM avx2
cglobal dst4, 3, 4, 6
    vbroadcasti128 m5, [pd_ %+ DST4_ROUND]
    mova        m4, [trans8_shuf]
    add         r2, r2                  ; byte pitch; full-width add so a negative intptr_t stride is not zero-extended
    lea         r3, [pw_dst4_tab]

    movq        xm0, [r0 + 0 * r2]
    movhps      xm0, [r0 + 1 * r2]
    lea         r0, [r0 + 2 * r2]
    movq        xm1, [r0]
    movhps      xm1, [r0 + r2]

    vinserti128 m0, m0, xm1, 1          ; m0 = src[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]

    ; ---- pass 1 ----
    pmaddwd     m2, m0, [r3 + 0 * 32]
    pmaddwd     m1, m0, [r3 + 1 * 32]
    phaddd      m2, m1
    paddd       m2, m5
    psrad       m2, DST4_SHIFT
    pmaddwd     m3, m0, [r3 + 2 * 32]
    pmaddwd     m1, m0, [r3 + 3 * 32]
    phaddd      m3, m1
    paddd       m3, m5
    psrad       m3, DST4_SHIFT
    packssdw    m2, m3
    vpermd      m2, m4, m2

    ; ---- pass 2 ----
    vpbroadcastd m5, [pd_128]
    pmaddwd     m0, m2, [r3 + 0 * 32]
    pmaddwd     m1, m2, [r3 + 1 * 32]
    phaddd      m0, m1
    paddd       m0, m5
    psrad       m0, 8
    pmaddwd     m3, m2, [r3 + 2 * 32]
    pmaddwd     m2, m2, [r3 + 3 * 32]
    phaddd      m3, m2
    paddd       m3, m5
    psrad       m3, 8
    packssdw    m0, m3
    vpermd      m0, m4, m0
    movu        [r1], m0
    RET

;-------------------------------------------------------
;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
; 4x4 inverse DST, SSE2 version.
;   r0 = src        16 contiguous int16_t coefficients (two unaligned 16-byte loads)
;   r1 = dst        four rows of four int16_t, 8 bytes stored per row
;   r2 = dstStride  row pitch in int16_t units (doubled below to a byte pitch)
;   r3 = scratch    -> tab_idst4 (rows of 16 bytes, used in pairs)
; Pass 1 rounds with pd_64 (m5) and shifts by 7; pass 2 rounds with
; pd_<IDCT_ROUND> (m6) and shifts by IDCT_SHIFT (bit-depth dependent).
INIT_XMM sse2
cglobal idst4, 3, 4, 7
    mova        m6, [pd_ %+ IDCT_ROUND]
    add         r2, r2                      ; byte pitch; full-width add so a negative intptr_t stride is not zero-extended
    lea         r3, [tab_idst4]
    mova        m5, [pd_64]

    movu        m0, [r0 + 0 * 16]
    movu        m1, [r0 + 1 * 16]

    punpcklwd   m2, m0, m1                  ; m2 = m128iAC
    punpckhwd   m0, m1                      ; m0 = m128iBD

    ; ---- pass 1: each output = m2 * row(2k) + m0 * row(2k+1) ----
    pmaddwd     m1, m2, [r3 + 0 * 16]
    pmaddwd     m3, m0, [r3 + 1 * 16]
    paddd       m1, m3
    paddd       m1, m5
    psrad       m1, 7                       ; m1 = S0

    pmaddwd     m3, m2, [r3 + 2 * 16]
    pmaddwd     m4, m0, [r3 + 3 * 16]
    paddd       m3, m4
    paddd       m3, m5
    psrad       m3, 7                       ; m3 = S8
    packssdw    m1, m3                      ; m1 = m128iA

    pmaddwd     m3, m2, [r3 + 4 * 16]
    pmaddwd     m4, m0, [r3 + 5 * 16]
    paddd       m3, m4
    paddd       m3, m5
    psrad       m3, 7                       ; m3 = S0

    pmaddwd     m2, [r3 + 6 * 16]
    pmaddwd     m0, [r3 + 7 * 16]
    paddd       m2, m0
    paddd       m2, m5
    psrad       m2, 7                       ; m2 = S8
    packssdw    m3, m2                      ; m3 = m128iD

    ; transpose the intermediate block and re-interleave it for pass 2
    punpcklwd   m0, m1, m3
    punpckhwd   m1, m3

    punpcklwd   m2, m0, m1
    punpckhwd   m0, m1
    punpcklwd   m1, m2, m0
    punpckhwd   m2, m0

    ; ---- pass 2 ----
    pmaddwd     m0, m1, [r3 + 0 * 16]
    pmaddwd     m3, m2, [r3 + 1 * 16]
    paddd       m0, m3
    paddd       m0, m6
    psrad       m0, IDCT_SHIFT              ; m0 = S0
    pmaddwd     m3, m1, [r3 + 2 * 16]
    pmaddwd     m4, m2, [r3 + 3 * 16]
    paddd       m3, m4
    paddd       m3, m6
    psrad       m3, IDCT_SHIFT              ; m3 = S8
    packssdw    m0, m3                      ; m0 = m128iA
    pmaddwd     m3, m1, [r3 + 4 * 16]
    pmaddwd     m4, m2, [r3 + 5 * 16]
    paddd       m3, m4
    paddd       m3, m6
    psrad       m3, IDCT_SHIFT              ; m3 = S0
    pmaddwd     m1, [r3 + 6 * 16]
    pmaddwd     m2, [r3 + 7 * 16]
    paddd       m1, m2
    paddd       m1, m6
    psrad       m1, IDCT_SHIFT              ; m1 = S8
    packssdw    m3, m1                      ; m3 = m128iD

    ; transpose back and store one 8-byte row at a time
    punpcklwd   m1, m0, m3
    punpckhwd   m0, m3

    punpcklwd   m2, m1, m0
    movlps      [r1 + 0 * r2], m2           ; row 0
    movhps      [r1 + 1 * r2], m2           ; row 1

    punpckhwd   m1, m0
    movlps      [r1 + 2 * r2], m1           ; row 2
    lea         r1, [r1 + 2 * r2]
    movhps      [r1 + r2], m1               ; row 3
    RET

;-----------------------------------------------------------------
;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-----------------------------------------------------------------
; 4x4 inverse DST, AVX2 version.
;   r0 = src        16 contiguous int16_t coefficients (two unaligned 16-byte loads)
;   r1 = dst        four rows of four int16_t, 8 bytes stored per row
;   r2 = dstStride  row pitch in int16_t units (doubled below to a byte pitch)
;   r3 = scratch    -> pw_idst4_tab (rows of 32 bytes; low and high lane hold
;                      different coefficient pairs)
; The interleaved input is duplicated into both 128-bit lanes so that one
; pmaddwd against a 32-byte table row produces two output vectors at once.
; Pass 1 rounds with pd_64 (m5) and shifts by 7; pass 2 rounds with
; pd_<IDCT_ROUND> (m4) and shifts by IDCT_SHIFT (bit-depth dependent).
INIT_YMM avx2
cglobal idst4, 3, 4, 6
    vbroadcasti128 m4, [pd_ %+ IDCT_ROUND]
    add         r2, r2                  ; byte pitch; full-width add so a negative intptr_t stride is not zero-extended
    lea         r3, [pw_idst4_tab]

    movu        xm0, [r0 + 0 * 16]
    movu        xm1, [r0 + 1 * 16]

    punpcklwd   m2, m0, m1
    punpckhwd   m0, m1

    vinserti128 m2, m2, xm2, 1          ; copy the low lane into the high lane
    vinserti128 m0, m0, xm0, 1

    ; ---- pass 1 ----
    vpbroadcastd m5, [pd_64]
    pmaddwd     m1, m2, [r3 + 0 * 32]
    pmaddwd     m3, m0, [r3 + 1 * 32]
    paddd       m1, m3
    paddd       m1, m5
    psrad       m1, 7
    pmaddwd     m3, m2, [r3 + 2 * 32]
    pmaddwd     m0, [r3 + 3 * 32]
    paddd       m3, m0
    paddd       m3, m5
    psrad       m3, 7

    packssdw    m0, m1, m3
    pshufb      m0, [pb_idst4_shuf]     ; per lane: interleave low and high words
    vpermq      m1, m0, 11101110b       ; m1 = high lane of m0 in both lanes

    ; transpose / re-interleave for pass 2 (only the low lane is meaningful here)
    punpcklwd   m2, m0, m1
    punpckhwd   m0, m1
    punpcklwd   m1, m2, m0
    punpckhwd   m2, m0

    vpermq      m1, m1, 01000100b       ; duplicate the low lane into the high lane
    vpermq      m2, m2, 01000100b

    ; ---- pass 2 ----
    pmaddwd     m0, m1, [r3 + 0 * 32]
    pmaddwd     m3, m2, [r3 + 1 * 32]
    paddd       m0, m3
    paddd       m0, m4
    psrad       m0, IDCT_SHIFT
    pmaddwd     m3, m1, [r3 + 2 * 32]
    pmaddwd     m2, m2, [r3 + 3 * 32]
    paddd       m3, m2
    paddd       m3, m4
    psrad       m3, IDCT_SHIFT

    packssdw    m0, m3
    pshufb      m1, m0, [pb_idst4_shuf]
    vpermq      m0, m1, 11101110b

    ; store one 8-byte row at a time from the low lane
    punpcklwd   m2, m1, m0
    movq        [r1 + 0 * r2], xm2      ; row 0
    movhps      [r1 + 1 * r2], xm2      ; row 1

    punpckhwd   m1, m0
    movq        [r1 + 2 * r2], xm1      ; row 2
    lea         r1, [r1 + 2 * r2]
    movhps      [r1 + r2], xm1          ; row 3
    RET

;-------------------------------------------------------
; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
;-------------------------------------------------------
INIT_XMM sse2
cglobal dct8, 3,6,8,0-16*mmsize
    ;------------------------
    ; Stack Mapping(dword)
    ;------------------------
    ; Row0[0-3] Row1[0-3]
    ; ...
    ; Row6[0-3] Row7[0-3]
    ; Row0[0-3] Row7[0-3]
    ; ...
    ; Row6[4-7] Row7[4-7]
    ;------------------------

    add         r2, r2
    lea         r3, [r2 * 3]
    mov         r5, rsp
%assign x 0
%rep 2
    movu        m0, [r0]
    movu        m1, [r0 + r2]
    movu        m2, [r0 + r2 * 2]
    movu        m3, [r0 + r3]

    punpcklwd   m4, m0, m1
    punpckhwd   m0, m1
    punpcklwd   m5, m2, m3
    punpckhwd   m2, m3
    punpckldq   m1, m4, m5          ; m1 = [1 0]
    punpckhdq   m4, m5              ; m4 = [3 2]
    punpckldq   m3, m0, m2
    punpckhdq   m0, m2
    pshufd      m2, m3, 0x4E        ; m2 = [4 5]
    pshufd      m0, m0, 0x4E        ; m0 = [6 7]

    paddw       m3, m1, m0
    psubw       m1, m0              ; m1 = [d1 d0]
    paddw       m0, m4, m2
    psubw       m4, m2              ; m4 = [d3 d2]
    punpcklqdq  m2, m3, m0          ; m2 = [s2 s0]
    punpckhqdq  m3, m0
    pshufd      m3, m3, 0x4E        ; m3 = [s1 s3]

    punpcklwd   m0, m1, m4          ; m0 = [d2/d0]
    punpckhwd   m1, m4              ; m1 = [d3/d1]
    punpckldq   m4, m0, m1          ; m4 = [d3 d1 d2 d0]
    punpckhdq   m0, m1              ; m0 = [d3 d1 d2 d0]

    ; odd
    lea         r4, [tab_dct8_1]
    pmaddwd     m1, m4, [r4 + 0*16]
    pmaddwd     m5, m0, [r4 + 0*16]
    pshufd      m1, m1, 0xD8
    pshufd      m5, m5, 0xD8
    mova        m7, m1
    punpckhqdq  m7, m5
    punpcklqdq  m1, m5
    paddd       m1, m7
    paddd       m1, [pd_ %+ DCT8_ROUND1]
    psrad       m1, DCT8_SHIFT1
  %if x == 1
    pshufd      m1, m1, 0x1B
  %endif
    mova        [r5 + 1*2*mmsize], m1 ; Row 1

    pmaddwd     m1, m4, [r4 + 1*16]
    pmaddwd     m5, m0, [r4 + 1*16]
    pshufd      m1, m1, 0xD8
    pshufd      m5, m5, 0xD8
    mova        m7, m1
    punpckhqdq  m7, m5
    punpcklqdq  m1, m5
    paddd       m1, m7
    paddd       m1, [pd_ %+ DCT8_ROUND1]
    psrad       m1, DCT8_SHIFT1
  %if x == 1
    pshufd      m1, m1, 0x1B
  %endif
    mova        [r5 + 3*2*mmsize], m1 ; Row 3

    pmaddwd     m1, m4, [r4 + 2*16]
    pmaddwd     m5, m0, [r4 + 2*16]
    pshufd      m1, m1, 0xD8
    pshufd      m5, m5, 0xD8
    mova        m7, m1
    punpckhqdq  m7, m5
    punpcklqdq  m1, m5
    paddd       m1, m7
    paddd       m1, [pd_ %+ DCT8_ROUND1]
    psrad       m1, DCT8_SHIFT1
  %if x == 1
    pshufd      m1, m1, 0x1B
  %endif
    mova        [r5 + 5*2*mmsize], m1 ; Row 5

    pmaddwd     m4, [r4 + 3*16]
    pmaddwd     m0, [r4 + 3*16]
    pshufd      m4, m4, 0xD8
    pshufd      m0, m0, 0xD8
    mova        m7, m4
    punpckhqdq  m7, m0
    punpcklqdq  m4, m0
    paddd       m4, m7
    paddd       m4, [pd_ %+ DCT8_ROUND1]
    psrad       m4, DCT8_SHIFT1
  %if x == 1
    pshufd      m4, m4, 0x1B
  %endif
    mova        [r5 + 7*2*mmsize], m4; Row 7

    ; even
    lea         r4, [tab_dct4]
    paddw       m0, m2, m3          ; m0 = [EE1 EE0]
    pshufd      m0, m0, 0xD8
    pshuflw     m0, m0, 0xD8
    pshufhw     m0, m0, 0xD8
    psubw       m2, m3              ; m2 = [EO1 EO0]
    pmullw      m2, [pw_ppppmmmm]
    pshufd      m2, m2, 0xD8
    pshuflw     m2, m2, 0xD8
    pshufhw     m2, m2, 0xD8
    pmaddwd     m3, m0, [r4 + 0*16]
    paddd       m3, [pd_ %+ DCT8_ROUND1]
    psrad       m3, DCT8_SHIFT1
  %if x == 1
    pshufd      m3, m3, 0x1B
  %endif
    mova        [r5 + 0*2*mmsize], m3 ; Row 0
    pmaddwd     m0, [r4 + 2*16]
    paddd       m0, [pd_ %+ DCT8_ROUND1]
    psrad       m0, DCT8_SHIFT1
  %if x == 1
    pshufd      m0, m0, 0x1B
  %endif
    mova        [r5 + 4*2*mmsize], m0 ; Row 4
    pmaddwd     m3, m2, [r4 + 1*16]
    paddd       m3, [pd_ %+ DCT8_ROUND1]
    psrad       m3, DCT8_SHIFT1
  %if x == 1
    pshufd      m3, m3, 0x1B
  %endif
    mova        [r5 + 2*2*mmsize], m3 ; Row 2
    pmaddwd     m2, [r4 + 3*16]
    paddd       m2, [pd_ %+ DCT8_ROUND1]
    psrad       m2, DCT8_SHIFT1
  %if x == 1
    pshufd      m2, m2, 0x1B
  %endif
    mova        [r5 + 6*2*mmsize], m2 ; Row 6

  %if x != 1
    lea         r0, [r0 + r2 * 4]
    add         r5, mmsize
  %endif
%assign x x+1
%endrep

    mov         r0, rsp                 ; r0 = pointer to Low Part
    lea         r4, [tab_dct8_2]

%assign x 0
%rep 4
    mova        m0, [r0 + 0*2*mmsize]     ; [3 2 1 0]
    mova        m1, [r0 + 1*2*mmsize]
    paddd       m2, m0, [r0 + (0*2+1)*mmsize]
    pshufd      m2, m2, 0x9C            ; m2 = [s2 s1 s3 s0]
    paddd       m3, m1, [r0 + (1*2+1)*mmsize]
    pshufd      m3, m3, 0x9C            ; m3 = ^^
    psubd       m0, [r0 + (0*2+1)*mmsize]     ; m0 = [d3 d2 d1 d0]
    psubd       m1, [r0 + (1*2+1)*mmsize]     ; m1 = ^^

    ; even
    pshufd      m4, m2, 0xD8
    pshufd      m3, m3, 0xD8
    mova        m7, m4
    punpckhqdq  m7, m3
    punpcklqdq  m4, m3
    mova        m2, m4
    paddd       m4, m7                  ; m4 = [EE1 EE0 EE1 EE0]
    psubd       m2, m7                  ; m2 = [EO1 EO0 EO1 EO0]

    pslld       m4, 6                   ; m4 = [64*EE1 64*EE0]
    mova        m5, m2
    pmuludq     m5, [r4 + 0*16]
    pshufd      m7, m2, 0xF5
    movu        m6, [r4 + 0*16 + 4]
    pmuludq     m7, m6
    pshufd      m5, m5, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m5, m7                  ; m5 = [36*EO1 83*EO0]
    pshufd      m7, m2, 0xF5
    pmuludq     m2, [r4 + 1*16]
    movu        m6, [r4 + 1*16 + 4]
    pmuludq     m7, m6
    pshufd      m2, m2, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m2, m7                  ; m2 = [83*EO1 36*EO0]

    pshufd      m3, m4, 0xD8
    pshufd      m5, m5, 0xD8
    mova        m7, m3
    punpckhqdq  m7, m5
    punpcklqdq  m3, m5
    paddd       m3, m7                  ; m3 = [Row2 Row0]
    paddd       m3, [pd_ %+ DCT8_ROUND2]
    psrad       m3, DCT8_SHIFT2
    pshufd      m4, m4, 0xD8
    pshufd      m2, m2, 0xD8
    mova        m7, m4
    punpckhqdq  m7, m2
    punpcklqdq  m4, m2
    psubd       m4, m7                  ; m4 = [Row6 Row4]
    paddd       m4, [pd_ %+ DCT8_ROUND2]
    psrad       m4, DCT8_SHIFT2

    packssdw    m3, m3
    movd        [r1 + 0*mmsize], m3
    pshufd      m3, m3, 1
    movd        [r1 + 2*mmsize], m3

    packssdw    m4, m4
    movd        [r1 + 4*mmsize], m4
    pshufd      m4, m4, 1
    movd        [r1 + 6*mmsize], m4

    ; odd
    mova        m2, m0
    pmuludq     m2, [r4 + 2*16]
    pshufd      m7, m0, 0xF5
    movu        m6, [r4 + 2*16 + 4]
    pmuludq     m7, m6
    pshufd      m2, m2, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m2, m7
    mova        m3, m1
    pmuludq     m3, [r4 + 2*16]
    pshufd      m7, m1, 0xF5
    pmuludq     m7, m6
    pshufd      m3, m3, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m3, m7
    mova        m4, m0
    pmuludq     m4, [r4 + 3*16]
    pshufd      m7, m0, 0xF5
    movu        m6, [r4 + 3*16 + 4]
    pmuludq     m7, m6
    pshufd      m4, m4, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m4, m7
    mova        m5, m1
    pmuludq     m5, [r4 + 3*16]
    pshufd      m7, m1, 0xF5
    pmuludq     m7, m6
    pshufd      m5, m5, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m5, m7
    pshufd      m2, m2, 0xD8
    pshufd      m3, m3, 0xD8
    mova        m7, m2
    punpckhqdq  m7, m3
    punpcklqdq  m2, m3
    paddd       m2, m7
    pshufd      m4, m4, 0xD8
    pshufd      m5, m5, 0xD8
    mova        m7, m4
    punpckhqdq  m7, m5
    punpcklqdq  m4, m5
    paddd       m4, m7
    pshufd      m2, m2, 0xD8
    pshufd      m4, m4, 0xD8
    mova        m7, m2
    punpckhqdq  m7, m4
    punpcklqdq  m2, m4
    paddd       m2, m7                  ; m2 = [Row3 Row1]
    paddd       m2, [pd_ %+ DCT8_ROUND2]
    psrad       m2, DCT8_SHIFT2

    packssdw    m2, m2
    movd        [r1 + 1*mmsize], m2
    pshufd      m2, m2, 1
    movd        [r1 + 3*mmsize], m2

    mova        m2, m0
    pmuludq     m2, [r4 + 4*16]
    pshufd      m7, m0, 0xF5
    movu        m6, [r4 + 4*16 + 4]
    pmuludq     m7, m6
    pshufd      m2, m2, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m2, m7
    mova        m3, m1
    pmuludq     m3, [r4 + 4*16]
    pshufd      m7, m1, 0xF5
    pmuludq     m7, m6
    pshufd      m3, m3, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m3, m7
    mova        m4, m0
    pmuludq     m4, [r4 + 5*16]
    pshufd      m7, m0, 0xF5
    movu        m6, [r4 + 5*16 + 4]
    pmuludq     m7, m6
    pshufd      m4, m4, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m4, m7
    mova        m5, m1
    pmuludq     m5, [r4 + 5*16]
    pshufd      m7, m1, 0xF5
    pmuludq     m7, m6
    pshufd      m5, m5, 0x88
    pshufd      m7, m7, 0x88
    punpckldq   m5, m7
    pshufd      m2, m2, 0xD8
    pshufd      m3, m3, 0xD8
    mova        m7, m2
    punpckhqdq  m7, m3
    punpcklqdq  m2, m3
    paddd       m2, m7
    pshufd      m4, m4, 0xD8
    pshufd      m5, m5, 0xD8
    mova        m7, m4
    punpckhqdq  m7, m5
    punpcklqdq  m4, m5
    paddd       m4, m7
    pshufd      m2, m2, 0xD8
    pshufd      m4, m4, 0xD8
    mova        m7, m2
    punpckhqdq  m7, m4
    punpcklqdq  m2, m4
    paddd       m2, m7                  ; m2 = [Row7 Row5]
    paddd       m2, [pd_ %+ DCT8_ROUND2]
    psrad       m2, DCT8_SHIFT2

    packssdw    m2, m2
    movd        [r1 + 5*mmsize], m2
    pshufd      m2, m2, 1
    movd        [r1 + 7*mmsize], m2
%if x < 3
    add         r1, mmsize/4
    add         r0, 2*2*mmsize
%endif
%assign x x+1
%endrep

    RET

;-------------------------------------------------------
; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
;
; SSE4 forward 8x8 DCT.
;   r0 = src, r1 = dst, r2 = srcStride in int16 units (doubled to bytes below)
; Pass 1 handles four source rows per %rep iteration and leaves dword
; results in a 16*mmsize stack buffer.  Pass 2 consumes that buffer and
; writes int16 coefficients to dst, whose rows are mmsize (16) bytes apart.
;-------------------------------------------------------
INIT_XMM sse4
cglobal dct8, 3,6,7,0-16*mmsize
    ;------------------------
    ; Stack Mapping(dword)
    ;------------------------
    ; Row k of the pass-1 result lives at rsp + k*2*mmsize:
    ;   first slot  : 4 dwords written by the x == 0 iteration
    ;   second slot : 4 dwords written by the x == 1 iteration, stored
    ;                 dword-reversed (pshufd 0x1B) so pass 2 can build its
    ;                 sums and differences with plain paddd/psubd
    ;------------------------
    mova        m6, [pd_ %+ DCT8_ROUND1]    ; pass-1 rounding constant

    add         r2, r2                      ; stride: int16 units -> bytes
    lea         r3, [r2 * 3]
    mov         r5, rsp                     ; r5 = pass-1 output cursor
%assign x 0
%rep 2
    movu        m0, [r0]
    movu        m1, [r0 + r2]
    movu        m2, [r0 + r2 * 2]
    movu        m3, [r0 + r3]

    ; regroup the four loaded rows so each qword holds one source column
    punpcklwd   m4, m0, m1
    punpckhwd   m0, m1
    punpcklwd   m5, m2, m3
    punpckhwd   m2, m3
    punpckldq   m1, m4, m5          ; m1 = [1 0]
    punpckhdq   m4, m5              ; m4 = [3 2]
    punpckldq   m3, m0, m2
    punpckhdq   m0, m2
    pshufd      m2, m3, 0x4E        ; m2 = [4 5]
    pshufd      m0, m0, 0x4E        ; m0 = [6 7]

    ; first butterfly stage: s = col[i] + col[7-i], d = col[i] - col[7-i]
    paddw       m3, m1, m0
    psubw       m1, m0              ; m1 = [d1 d0]
    paddw       m0, m4, m2
    psubw       m4, m2              ; m4 = [d3 d2]
    punpcklqdq  m2, m3, m0          ; m2 = [s2 s0]
    punpckhqdq  m3, m0
    pshufd      m3, m3, 0x4E        ; m3 = [s1 s3]

    punpcklwd   m0, m1, m4          ; m0 = [d2/d0]
    punpckhwd   m1, m4              ; m1 = [d3/d1]
    punpckldq   m4, m0, m1          ; m4 = [d3 d1 d2 d0]
    punpckhdq   m0, m1              ; m0 = [d3 d1 d2 d0]

    ; odd
    lea         r4, [tab_dct8_1]
    pmaddwd     m1, m4, [r4 + 0*16]
    pmaddwd     m5, m0, [r4 + 0*16]
    phaddd      m1, m5
    paddd       m1, m6
    psrad       m1, DCT8_SHIFT1
  %if x == 1
    pshufd      m1, m1, 0x1B
  %endif
    mova        [r5 + 1*2*mmsize], m1 ; Row 1

    pmaddwd     m1, m4, [r4 + 1*16]
    pmaddwd     m5, m0, [r4 + 1*16]
    phaddd      m1, m5
    paddd       m1, m6
    psrad       m1, DCT8_SHIFT1
  %if x == 1
    pshufd      m1, m1, 0x1B
  %endif
    mova        [r5 + 3*2*mmsize], m1 ; Row 3

    pmaddwd     m1, m4, [r4 + 2*16]
    pmaddwd     m5, m0, [r4 + 2*16]
    phaddd      m1, m5
    paddd       m1, m6
    psrad       m1, DCT8_SHIFT1
  %if x == 1
    pshufd      m1, m1, 0x1B
  %endif
    mova        [r5 + 5*2*mmsize], m1 ; Row 5

    pmaddwd     m4, [r4 + 3*16]
    pmaddwd     m0, [r4 + 3*16]
    phaddd      m4, m0
    paddd       m4, m6
    psrad       m4, DCT8_SHIFT1
  %if x == 1
    pshufd      m4, m4, 0x1B
  %endif
    mova        [r5 + 7*2*mmsize], m4; Row 7

    ; even
    lea         r4, [tab_dct4]
    paddw       m0, m2, m3          ; m0 = [EE1 EE0]
    pshufb      m0, [pb_unpackhlw1]
    psubw       m2, m3              ; m2 = [EO1 EO0]
    psignw      m2, [pw_ppppmmmm]
    pshufb      m2, [pb_unpackhlw1]
    pmaddwd     m3, m0, [r4 + 0*16]
    paddd       m3, m6
    psrad       m3, DCT8_SHIFT1
  %if x == 1
    pshufd      m3, m3, 0x1B
  %endif
    mova        [r5 + 0*2*mmsize], m3 ; Row 0
    pmaddwd     m0, [r4 + 2*16]
    paddd       m0, m6
    psrad       m0, DCT8_SHIFT1
  %if x == 1
    pshufd      m0, m0, 0x1B
  %endif
    mova        [r5 + 4*2*mmsize], m0 ; Row 4
    pmaddwd     m3, m2, [r4 + 1*16]
    paddd       m3, m6
    psrad       m3, DCT8_SHIFT1
  %if x == 1
    pshufd      m3, m3, 0x1B
  %endif
    mova        [r5 + 2*2*mmsize], m3 ; Row 2
    pmaddwd     m2, [r4 + 3*16]
    paddd       m2, m6
    psrad       m2, DCT8_SHIFT1
  %if x == 1
    pshufd      m2, m2, 0x1B
  %endif
    mova        [r5 + 6*2*mmsize], m2 ; Row 6

  %if x != 1
    lea         r0, [r0 + r2 * 4]   ; advance to the next four source rows
    add         r5, mmsize          ; second iteration fills the odd slots
  %endif
%assign x x+1
%endrep

    ; Pass 2: .pass2 runs twice and is unrolled twice, i.e. four iterations.
    ; Each iteration consumes two pass-1 rows (four stack slots) and writes
    ; two int16 values (one movd) into each of the eight output rows.
    mov         r2, 2
    mov         r0, rsp                 ; r0 = pointer to Low Part
    lea         r4, [tab_dct8_2]
    ; use the shared pass-2 constants, as the SSE2 and AVX2 dct8 paths do
    mova        m6, [pd_ %+ DCT8_ROUND2]

.pass2:
%rep 2
    mova        m0, [r0 + 0*2*mmsize]     ; [3 2 1 0]
    mova        m1, [r0 + 1*2*mmsize]
    paddd       m2, m0, [r0 + (0*2+1)*mmsize]
    pshufd      m2, m2, 0x9C            ; m2 = [s2 s1 s3 s0]
    paddd       m3, m1, [r0 + (1*2+1)*mmsize]
    pshufd      m3, m3, 0x9C            ; m3 = ^^
    psubd       m0, [r0 + (0*2+1)*mmsize]     ; m0 = [d3 d2 d1 d0]
    psubd       m1, [r0 + (1*2+1)*mmsize]     ; m1 = ^^

    ; even
    phaddd      m4, m2, m3              ; m4 = [EE1 EE0 EE1 EE0]
    phsubd      m2, m3                  ; m2 = [EO1 EO0 EO1 EO0]

    pslld       m4, 6                   ; m4 = [64*EE1 64*EE0]
    pmulld      m5, m2, [r4 + 0*16]     ; m5 = [36*EO1 83*EO0]
    pmulld      m2, [r4 + 1*16]         ; m2 = [83*EO1 36*EO0]

    phaddd      m3, m4, m5              ; m3 = [Row2 Row0]
    paddd       m3, m6
    psrad       m3, DCT8_SHIFT2
    phsubd      m4, m2                  ; m4 = [Row6 Row4]
    paddd       m4, m6
    psrad       m4, DCT8_SHIFT2

    packssdw    m3, m3
    movd        [r1 + 0*mmsize], m3
    pshufd      m3, m3, 1
    movd        [r1 + 2*mmsize], m3

    packssdw    m4, m4
    movd        [r1 + 4*mmsize], m4
    pshufd      m4, m4, 1
    movd        [r1 + 6*mmsize], m4

    ; odd
    pmulld      m2, m0, [r4 + 2*16]
    pmulld      m3, m1, [r4 + 2*16]
    pmulld      m4, m0, [r4 + 3*16]
    pmulld      m5, m1, [r4 + 3*16]
    phaddd      m2, m3
    phaddd      m4, m5
    phaddd      m2, m4                  ; m2 = [Row3 Row1]
    paddd       m2, m6
    psrad       m2, DCT8_SHIFT2

    packssdw    m2, m2
    movd        [r1 + 1*mmsize], m2
    pshufd      m2, m2, 1
    movd        [r1 + 3*mmsize], m2

    pmulld      m2, m0, [r4 + 4*16]
    pmulld      m3, m1, [r4 + 4*16]
    pmulld      m4, m0, [r4 + 5*16]
    pmulld      m5, m1, [r4 + 5*16]
    phaddd      m2, m3
    phaddd      m4, m5
    phaddd      m2, m4                  ; m2 = [Row7 Row5]
    paddd       m2, m6
    psrad       m2, DCT8_SHIFT2

    packssdw    m2, m2
    movd        [r1 + 5*mmsize], m2
    pshufd      m2, m2, 1
    movd        [r1 + 7*mmsize], m2

    add         r1, mmsize/4            ; next two output columns
    add         r0, 2*2*mmsize          ; next two pass-1 rows
%endrep

    dec         r2
    jnz        .pass2
    RET

;-------------------------------------------------------
; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
;
; SSE2 inverse 8x8 DCT, built only for x86-64 (uses m8-m15).
;   r0 = src : eight rows of eight int16, read with mova at a 16-byte pitch
;   r1 = dst : eight rows of eight int16, row pitch = dstStride int16 units
;   r2 = dstStride (doubled to bytes just before the stores)
; Two passes run back to back.  The first rounds with pd_64 and shifts by 7,
; the second uses IDCT_ROUND / IDCT_SHIFT.  The 5*mmsize stack area holds
; odd-part sums that are needed again after their register is reused.
; Intermediate names (E = even part, O = odd part) follow the comments in
; the SSSE3 helper routines further down this file.
;-------------------------------------------------------
%if ARCH_X86_64
INIT_XMM sse2
cglobal idct8, 3, 6, 16, 0-5*mmsize
    ; ---- pass 1, odd part: source rows 1,3,5,7 multiplied by tab_idct8_3 ----
    ; m7/m9 = rows 1 and 3 interleaved word-wise (low half / high half)
    mova        m9, [r0 + 1 * mmsize]
    mova        m1, [r0 + 3 * mmsize]
    mova        m7, m9
    punpcklwd   m7, m1
    punpckhwd   m9, m1
    mova        m14, [tab_idct8_3]
    mova        m3, m14
    pmaddwd     m14, m7
    pmaddwd     m3, m9
    ; m2/m0 = rows 5 and 7 interleaved word-wise (low half / high half)
    mova        m0, [r0 + 5 * mmsize]
    mova        m10, [r0 + 7 * mmsize]
    mova        m2, m0
    punpcklwd   m2, m10
    punpckhwd   m0, m10
    mova        m15, [tab_idct8_3 + 1 * mmsize]
    mova        m11, [tab_idct8_3 + 1 * mmsize]
    pmaddwd     m15, m2
    mova        m4, [tab_idct8_3 + 2 * mmsize]
    pmaddwd     m11, m0
    mova        m1, [tab_idct8_3 + 2 * mmsize]
    paddd       m15, m14
    mova        m5, [tab_idct8_3 + 4 * mmsize]
    mova        m12, [tab_idct8_3 + 4 * mmsize]
    paddd       m11, m3
    ; spill the first odd sums (slot 0 = high half, slot 1 = low half);
    ; m11/m15 themselves stay live and are consumed in the combine step
    mova        [rsp + 0 * mmsize], m11
    mova        [rsp + 1 * mmsize], m15
    pmaddwd     m4, m7
    pmaddwd     m1, m9
    mova        m14, [tab_idct8_3 + 3 * mmsize]
    mova        m3, [tab_idct8_3 + 3 * mmsize]
    pmaddwd     m14, m2
    pmaddwd     m3, m0
    paddd       m14, m4
    paddd       m3, m1
    mova        [rsp + 2 * mmsize], m3
    pmaddwd     m5, m9
    pmaddwd     m9, [tab_idct8_3 + 6 * mmsize]
    mova        m6, [tab_idct8_3 + 5 * mmsize]
    pmaddwd     m12, m7
    pmaddwd     m7, [tab_idct8_3 + 6 * mmsize]
    mova        m4, [tab_idct8_3 + 5 * mmsize]
    pmaddwd     m6, m2
    paddd       m6, m12
    pmaddwd     m2, [tab_idct8_3 + 7 * mmsize]
    paddd       m7, m2
    mova        [rsp + 3 * mmsize], m6
    pmaddwd     m4, m0
    pmaddwd     m0, [tab_idct8_3 + 7 * mmsize]
    paddd       m9, m0
    paddd       m5, m4
    ; ---- pass 1, even part: source rows 0,4 and 2,6 multiplied by tab_dct4 ----
    mova        m6, [r0 + 0 * mmsize]
    mova        m0, [r0 + 4 * mmsize]
    mova        m4, m6
    punpcklwd   m4, m0
    punpckhwd   m6, m0
    mova        m12, [r0 + 2 * mmsize]
    mova        m0, [r0 + 6 * mmsize]
    mova        m13, m12
    mova        m8, [tab_dct4]
    punpcklwd   m13, m0
    mova        m10, [tab_dct4]
    punpckhwd   m12, m0
    pmaddwd     m8, m4
    mova        m3, m8
    pmaddwd     m4, [tab_dct4 + 2 * mmsize]
    pmaddwd     m10, m6
    mova        m2, [tab_dct4 + 1 * mmsize]
    mova        m1, m10
    pmaddwd     m6, [tab_dct4 + 2 * mmsize]
    mova        m0, [tab_dct4 + 1 * mmsize]
    pmaddwd     m2, m13
    paddd       m3, m2
    psubd       m8, m2
    mova        m2, m6
    pmaddwd     m13, [tab_dct4 + 3 * mmsize]
    pmaddwd     m0, m12
    paddd       m1, m0
    psubd       m10, m0
    mova        m0, m4
    pmaddwd     m12, [tab_dct4 + 3 * mmsize]
    ; first-pass rounding term (64), added to every even-part sum so the
    ; >> 7 in the combine step rounds to nearest
    paddd       m3, [pd_64]
    paddd       m1, [pd_64]
    paddd       m8, [pd_64]
    paddd       m10, [pd_64]
    paddd       m0, m13
    paddd       m2, m12
    paddd       m0, [pd_64]
    paddd       m2, [pd_64]
    psubd       m4, m13
    psubd       m6, m12
    paddd       m4, [pd_64]
    paddd       m6, [pd_64]
    ; ---- pass 1: even +/- odd, arithmetic shift right by 7, saturate to int16 ----
    mova        m12, m8
    psubd       m8, m7
    psrad       m8, 7
    paddd       m15, m3
    psubd       m3, [rsp + 1 * mmsize]
    psrad       m15, 7
    paddd       m12, m7
    psrad       m12, 7
    paddd       m11, m1
    mova        m13, m14
    psrad       m11, 7
    packssdw    m15, m11
    psubd       m1, [rsp + 0 * mmsize]
    psrad       m1, 7
    mova        m11, [rsp + 2 * mmsize]
    paddd       m14, m0
    psrad       m14, 7
    psubd       m0, m13
    psrad       m0, 7
    paddd       m11, m2
    mova        m13, [rsp + 3 * mmsize]
    psrad       m11, 7
    packssdw    m14, m11
    mova        m11, m6
    psubd       m6, m5
    paddd       m13, m4
    psrad       m13, 7
    psrad       m6, 7
    paddd       m11, m5
    psrad       m11, 7
    packssdw    m13, m11
    mova        m11, m10
    psubd       m4, [rsp + 3 * mmsize]
    psubd       m10, m9
    psrad       m4, 7
    psrad       m10, 7
    packssdw    m4, m6
    packssdw    m8, m10
    paddd       m11, m9
    psrad       m11, 7
    packssdw    m12, m11
    psubd       m2, [rsp + 2 * mmsize]
    mova        m5, m15
    psrad       m2, 7
    packssdw    m0, m2
    mova        m2, m14
    psrad       m3, 7
    packssdw    m3, m1
    mova        m6, m13
    ; The word interleaves from here on regroup the eight packed result
    ; registers into the operand layout pass 2 expects.
    ; NOTE(review): looks like an 8x8 word transpose - confirm.
    punpcklwd   m5, m8
    punpcklwd   m2, m4
    mova        m1, m12
    punpcklwd   m6, m0
    punpcklwd   m1, m3
    mova        m9, m5
    punpckhwd   m13, m0
    mova        m0, m2
    punpcklwd   m9, m6
    punpckhwd   m5, m6
    punpcklwd   m0, m1
    punpckhwd   m2, m1
    punpckhwd   m15, m8
    mova        m1, m5
    punpckhwd   m14, m4
    punpckhwd   m12, m3
    mova        m6, m9
    punpckhwd   m9, m0
    punpcklwd   m1, m2
    ; pass 2 starts here, interleaved with the remaining regrouping steps;
    ; its odd part multiplies by tab_idct8_3 again
    mova        m4, [tab_idct8_3 + 0 * mmsize]
    punpckhwd   m5, m2
    punpcklwd   m6, m0
    mova        m2, m15
    mova        m0, m14
    mova        m7, m9
    punpcklwd   m2, m13
    punpcklwd   m0, m12
    punpcklwd   m7, m5
    punpckhwd   m14, m12
    mova        m10, m2
    punpckhwd   m15, m13
    punpckhwd   m9, m5
    pmaddwd     m4, m7
    mova        m13, m1
    punpckhwd   m2, m0
    punpcklwd   m10, m0
    mova        m0, m15
    punpckhwd   m15, m14
    mova        m12, m1
    mova        m3, [tab_idct8_3 + 0 * mmsize]
    punpcklwd   m0, m14
    pmaddwd     m3, m9
    mova        m11, m2
    punpckhwd   m2, m15
    punpcklwd   m11, m15
    mova        m8, [tab_idct8_3 + 1 * mmsize]
    punpcklwd   m13, m0
    punpckhwd   m12, m0
    pmaddwd     m8, m11
    paddd       m8, m4
    ; pass-2 odd sums are spilled to stack slots 4, 3, 2, 1, 0 as they are
    ; produced and reloaded in the combine step below
    mova        [rsp + 4 * mmsize], m8
    mova        m4, [tab_idct8_3 + 2 * mmsize]
    pmaddwd     m4, m7
    mova        m15, [tab_idct8_3 + 2 * mmsize]
    mova        m5, [tab_idct8_3 + 1 * mmsize]
    pmaddwd     m15, m9
    pmaddwd     m5, m2
    paddd       m5, m3
    mova        [rsp + 3 * mmsize], m5
    mova        m14, [tab_idct8_3 + 3 * mmsize]
    mova        m5, [tab_idct8_3 + 3 * mmsize]
    pmaddwd     m14, m11
    paddd       m14, m4
    mova        [rsp + 2 * mmsize], m14
    pmaddwd     m5, m2
    paddd       m5, m15
    mova        [rsp + 1 * mmsize], m5
    mova        m15, [tab_idct8_3 + 4 * mmsize]
    mova        m5, [tab_idct8_3 + 4 * mmsize]
    pmaddwd     m15, m7
    pmaddwd     m7, [tab_idct8_3 + 6 * mmsize]
    pmaddwd     m5, m9
    pmaddwd     m9, [tab_idct8_3 + 6 * mmsize]
    mova        m4, [tab_idct8_3 + 5 * mmsize]
    pmaddwd     m4, m2
    paddd       m5, m4
    mova        m4, m6
    mova        m8, [tab_idct8_3 + 5 * mmsize]
    punpckhwd   m6, m10
    pmaddwd     m2, [tab_idct8_3 + 7 * mmsize]
    punpcklwd   m4, m10
    paddd       m9, m2
    pmaddwd     m8, m11
    mova        m10, [tab_dct4]
    paddd       m8, m15
    pmaddwd     m11, [tab_idct8_3 + 7 * mmsize]
    paddd       m7, m11
    mova        [rsp + 0 * mmsize], m8
    ; ---- pass 2, even part: products with tab_dct4 ----
    pmaddwd     m10, m6
    pmaddwd     m6, [tab_dct4 + 2 * mmsize]
    mova        m1, m10
    mova        m8, [tab_dct4]
    mova        m3, [tab_dct4 + 1 * mmsize]
    pmaddwd     m8, m4
    pmaddwd     m4, [tab_dct4 + 2 * mmsize]
    mova        m0, m8
    mova        m2, [tab_dct4 + 1 * mmsize]
    pmaddwd     m3, m13
    psubd       m8, m3
    paddd       m0, m3
    mova        m3, m6
    pmaddwd     m13, [tab_dct4 + 3 * mmsize]
    pmaddwd     m2, m12
    paddd       m1, m2
    psubd       m10, m2
    mova        m2, m4
    pmaddwd     m12, [tab_dct4 + 3 * mmsize]
    ; second-pass rounding constant, added to every even-part sum
    mova        m15, [pd_ %+ IDCT_ROUND]
    paddd       m0, m15
    paddd       m1, m15
    paddd       m8, m15
    paddd       m10, m15
    paddd       m2, m13
    paddd       m3, m12
    paddd       m2, m15
    paddd       m3, m15
    psubd       m4, m13
    psubd       m6, m12
    paddd       m4, m15
    paddd       m6, m15
    ; ---- pass 2: even +/- odd, shift right by IDCT_SHIFT, saturate to int16 ----
    mova        m15, [rsp + 4 * mmsize]
    mova        m12, m8
    psubd       m8, m7
    psrad       m8, IDCT_SHIFT
    mova        m11, [rsp + 3 * mmsize]
    paddd       m15, m0
    psrad       m15, IDCT_SHIFT
    psubd       m0, [rsp + 4 * mmsize]
    psrad       m0, IDCT_SHIFT
    paddd       m12, m7
    paddd       m11, m1
    mova        m14, [rsp + 2 * mmsize]
    psrad       m11, IDCT_SHIFT
    packssdw    m15, m11
    psubd       m1, [rsp + 3 * mmsize]
    psrad       m1, IDCT_SHIFT
    mova        m11, [rsp + 1 * mmsize]
    paddd       m14, m2
    psrad       m14, IDCT_SHIFT
    packssdw    m0, m1
    psrad       m12, IDCT_SHIFT
    psubd       m2, [rsp + 2 * mmsize]
    paddd       m11, m3
    mova        m13, [rsp + 0 * mmsize]
    psrad       m11, IDCT_SHIFT
    packssdw    m14, m11
    mova        m11, m6
    psubd       m6, m5
    paddd       m13, m4
    psrad       m13, IDCT_SHIFT
    mova        m1, m15
    paddd       m11, m5
    psrad       m11, IDCT_SHIFT
    packssdw    m13, m11
    mova        m11, m10
    psubd       m10, m9
    psrad       m10, IDCT_SHIFT
    packssdw    m8, m10
    psrad       m6, IDCT_SHIFT
    psubd       m4, [rsp + 0 * mmsize]
    paddd       m11, m9
    psrad       m11, IDCT_SHIFT
    packssdw    m12, m11
    punpcklwd   m1, m14
    mova        m5, m13
    psrad       m4, IDCT_SHIFT
    packssdw    m4, m6
    psubd       m3, [rsp + 1 * mmsize]
    psrad       m2, IDCT_SHIFT
    mova        m6, m8
    psrad       m3, IDCT_SHIFT
    punpcklwd   m5, m12
    packssdw    m2, m3
    punpcklwd   m6, m4
    punpckhwd   m8, m4
    mova        m4, m1
    mova        m3, m2
    punpckhdq   m1, m5
    punpckldq   m4, m5
    punpcklwd   m3, m0
    punpckhwd   m2, m0
    mova        m0, m6
    ; output row offsets in bytes: r2 = 2*dstStride, r3 = 3*r2, r4 = 5*r2,
    ; r0 = 7*r2 (the source pointer in r0 is no longer needed here)
    lea         r2, [r2 + r2]
    lea         r4, [r2 + r2]
    lea         r3, [r4 + r2]
    lea         r4, [r4 + r3]
    lea         r0, [r4 + r2 * 2]
    ; every movq/movhps writes 8 bytes, i.e. half of one output row:
    ; first halves at +0, second halves at +8
    movq        [r1], m4
    punpckhwd   m15, m14
    movhps      [r1 + r2], m4
    punpckhdq   m0, m3
    movq        [r1 + r2 * 2], m1
    punpckhwd   m13, m12
    movhps      [r1 + r3], m1
    mova        m1, m6
    punpckldq   m1, m3
    movq        [r1 + 8], m1
    movhps      [r1 + r2 + 8], m1
    movq        [r1 + r2 * 2 + 8], m0
    movhps      [r1 + r3 + 8], m0
    mova        m0, m15
    punpckhdq   m15, m13
    punpckldq   m0, m13
    movq        [r1 + r2 * 4], m0
    movhps      [r1 + r4], m0
    mova        m0, m8
    punpckhdq   m8, m2
    movq        [r1 + r3 * 2], m15
    punpckldq   m0, m2
    movhps      [r1 + r0], m15
    movq        [r1 + r2 * 4 + 8], m0
    movhps      [r1 + r4 + 8], m0
    movq        [r1 + r3 * 2 + 8], m8
    movhps      [r1 + r0 + 8], m8
    RET
%endif

;-------------------------------------------------------
; SSSE3 implementation of
; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
; (two internal helpers followed by the idct8 entry point)
;-------------------------------------------------------
; patial_butterfly_inverse_internal_pass1: first inverse pass over a group
; of four columns.  No arguments are declared on cglobal; the caller sets up:
;   r0 = source; 8 bytes are read from each of 8 rows at a 16-byte pitch
;   r4 = odd-part coefficients, eight 16-byte rows (idct8 passes tab_idct8_3)
;   r6 = even-part coefficients, four 16-byte rows (idct8 passes tab_dct4)
;   r5 = destination; 8 bytes are written to each of 8 rows at a 16-byte pitch
; Rounds with pd_64 and shifts right by 7.  Overwrites m0-m7; r0/r4/r5/r6
; are left unchanged.
INIT_XMM ssse3
cglobal patial_butterfly_inverse_internal_pass1
    movh        m0, [r0]
    movhps      m0, [r0 + 2 * 16]
    movh        m1, [r0 + 4 * 16]
    movhps      m1, [r0 + 6 * 16]

    punpckhwd   m2, m0, m1                  ; [2 6]
    punpcklwd   m0, m1                      ; [0 4]
    pmaddwd     m1, m0, [r6]                ; EE[0]
    pmaddwd     m0, [r6 + 32]               ; EE[1]
    pmaddwd     m3, m2, [r6 + 16]           ; EO[0]
    pmaddwd     m2, [r6 + 48]               ; EO[1]

    paddd       m4, m1, m3                  ; E[0]
    psubd       m1, m3                      ; E[3]
    paddd       m3, m0, m2                  ; E[1]
    psubd       m0, m2                      ; E[2]

    ;E[K] = E[k] + add
    ; the rounding term is folded into E once, so both E+O and E-O are rounded
    mova        m5, [pd_64]
    paddd       m0, m5
    paddd       m1, m5
    paddd       m3, m5
    paddd       m4, m5

    movh        m2, [r0 + 16]
    movhps      m2, [r0 + 5 * 16]
    movh        m5, [r0 + 3 * 16]
    movhps      m5, [r0 + 7 * 16]
    punpcklwd   m6, m2, m5                  ;[1 3]
    punpckhwd   m2, m5                      ;[5 7]

    ; for each k: output row k = (E[k] + O[k]) >> 7, row 7-k = (E[k] - O[k]) >> 7
    pmaddwd     m5, m6, [r4]
    pmaddwd     m7, m2, [r4 + 16]
    paddd       m5, m7                      ; O[0]

    paddd       m7, m4, m5
    psrad       m7, 7

    psubd       m4, m5
    psrad       m4, 7

    packssdw    m7, m4
    movh        [r5 + 0 * 16], m7
    movhps      [r5 + 7 * 16], m7

    pmaddwd     m5, m6, [r4 + 32]
    pmaddwd     m4, m2, [r4 + 48]
    paddd       m5, m4                      ; O[1]

    paddd       m4, m3, m5
    psrad       m4, 7

    psubd       m3, m5
    psrad       m3, 7

    packssdw    m4, m3
    movh        [r5 + 1 * 16], m4
    movhps      [r5 + 6 * 16], m4

    pmaddwd     m5, m6, [r4 + 64]
    pmaddwd     m4, m2, [r4 + 80]
    paddd       m5, m4                      ; O[2]

    paddd       m4, m0, m5
    psrad       m4, 7

    psubd       m0, m5
    psrad       m0, 7

    packssdw    m4, m0
    movh        [r5 + 2 * 16], m4
    movhps      [r5 + 5 * 16], m4

    pmaddwd     m5, m6, [r4 + 96]
    pmaddwd     m4, m2, [r4 + 112]
    paddd       m5, m4                      ; O[3]

    paddd       m4, m1, m5
    psrad       m4, 7

    psubd       m1, m5
    psrad       m1, 7

    packssdw    m4, m1
    movh        [r5 + 3 * 16], m4
    movhps      [r5 + 4 * 16], m4

    ret

; PARTIAL_BUTTERFLY_PROCESS_ROW reg
; Second inverse pass on one row of eight int16 values held in %1; the eight
; int16 results are left in %1.
; Expects: m6 = rounding constant, r4 = odd-part coefficients (two 16-byte
;          rows), r6 = pshufb mask applied to %1 before the odd products.
; Reads pb_idct8even and tab_idct8_1 directly.  Clobbers m4 and m5.
%macro PARTIAL_BUTTERFLY_PROCESS_ROW 1
    pshufb      m4, %1, [pb_idct8even]
    pmaddwd     m4, [tab_idct8_1]
    ; m5's previous contents only reach the low half of the phsubd result,
    ; which punpckhqdq below discards; the high half is computed from m4
    phsubd      m5, m4
    pshufd      m4, m4, 0x4E
    phaddd      m4, m4
    punpckhqdq  m4, m5                      ;m4 = dd e[ 0 1 2 3]
    paddd       m4, m6

    pshufb      %1, %1, [r6]
    pmaddwd     m5, %1, [r4]
    pmaddwd     %1, [r4 + 16]
    phaddd      m5, %1                      ; m5 = dd O[0, 1, 2, 3]

    ; low four outputs = (E + O) >> IDCT_SHIFT
    paddd       %1, m4, m5
    psrad       %1, IDCT_SHIFT

    ; high four outputs = (E - O) >> IDCT_SHIFT, in reversed dword order
    psubd       m4, m5
    psrad       m4, IDCT_SHIFT
    pshufd      m4, m4, 0x1B

    packssdw    %1, m4
%endmacro

; patial_butterfly_inverse_internal_pass2: second inverse pass for four rows.
; No arguments are declared on cglobal; the caller sets up:
;   r5 = source: four 16-byte rows, read with mova
;   r1 = destination, r2 = destination row pitch in bytes, r3 = 3 * r2
;   m6, r4, r6 as required by PARTIAL_BUTTERFLY_PROCESS_ROW
; Overwrites m0-m5.
INIT_XMM ssse3
cglobal patial_butterfly_inverse_internal_pass2
    mova        m0, [r5]
    PARTIAL_BUTTERFLY_PROCESS_ROW m0
    movu        [r1], m0

    mova        m2, [r5 + 16]
    PARTIAL_BUTTERFLY_PROCESS_ROW m2
    movu        [r1 + r2], m2

    mova        m1, [r5 + 32]
    PARTIAL_BUTTERFLY_PROCESS_ROW m1
    movu        [r1 + 2 * r2], m1

    mova        m3, [r5 + 48]
    PARTIAL_BUTTERFLY_PROCESS_ROW m3
    movu        [r1 + r3], m3
    ret

; SSSE3 idct8 entry point: r0 = src, r1 = dst, r2 = dstStride (int16 units).
; Runs the pass-1 helper twice (four columns each) into a stack scratch
; buffer, then the pass-2 helper twice (four rows each) from that buffer
; into dst.
INIT_XMM ssse3
cglobal idct8, 3,7,8 ;,0-16*mmsize
    ; alignment stack to 64-bytes
    ; cglobal reserves no stack here; the scratch area is carved out by hand.
    ; The caller's rsp is kept in the extra gprsize bytes just above the
    ; 16*mmsize scratch area and reloaded before RET.
    mov         r5, rsp
    sub         rsp, 16*mmsize + gprsize
    and         rsp, ~(64-1)
    mov         [rsp + 16*mmsize], r5
    mov         r5, rsp                     ; r5 = scratch buffer for pass 1

    lea         r4, [tab_idct8_3]           ; pass-1 odd-part coefficients
    lea         r6, [tab_dct4]              ; pass-1 even-part coefficients

    call        patial_butterfly_inverse_internal_pass1

    ; step 8 bytes (four int16 columns) to the right in source and scratch
    add         r0, 8
    add         r5, 8

    call        patial_butterfly_inverse_internal_pass1

    mova        m6, [pd_ %+ IDCT_ROUND]     ; pass-2 rounding constant
    add         r2, r2                      ; stride: int16 units -> bytes
    lea         r3, [r2 * 3]
    lea         r4, [tab_idct8_2]           ; pass-2 odd-part coefficients
    lea         r6, [pb_idct8odd]           ; pass-2 shuffle mask for odd inputs
    sub         r5, 8                       ; back to the start of the scratch buffer

    call        patial_butterfly_inverse_internal_pass2

    ; next four output rows / next four 16-byte scratch rows
    lea         r1, [r1 + 4 * r2]
    add         r5, 64

    call        patial_butterfly_inverse_internal_pass2

    ; restore origin stack pointer
    mov         rsp, [rsp + 16*mmsize]
    RET


;-----------------------------------------------------------------------------
; void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
;
; For every coefficient: sum[i] += |dct[i]|, then dct[i] is replaced by
; |dct[i]| - offset[i] (unsigned saturating, so never below zero) with the
; original sign re-applied.
;   r0 = dct, r1 = sum, r2 = offset, r3d = size
; Handles 8 coefficients per iteration; size must be a non-zero multiple of 8.
;-----------------------------------------------------------------------------
INIT_XMM sse4
cglobal denoise_dct, 4, 4, 6
    pxor     m5,  m5                ; m5 = 0, comparison reference below
    shr      r3d, 3                 ; r3d = iteration count (8 coefficients each)
.loop:
    movu     m0, [r0]
    pabsw    m1, m0                 ; m1 = |dct|, an unsigned 16-bit magnitude
    ; Widen with zero extension: pabsw(-32768) is 0x8000, which a signed
    ; extension would turn into -32768 and subtract from the uint32 sum.
    movu     m2, [r1]
    pmovzxwd m3, m1
    paddd    m2, m3
    movu     [r1], m2
    movu     m2, [r1 + 16]
    psrldq   m3, m1, 8              ; upper four magnitudes
    pmovzxwd m4, m3
    paddd    m2, m4
    movu     [r1 + 16], m2

    movu     m3, [r2]
    psubusw  m1, m3                 ; max(|dct| - offset, 0)
    pcmpgtw  m4, m1, m5             ; signed compare: keep lanes that are > 0
    pand     m1, m4
    psignw   m1, m0                 ; re-apply the sign of the original coefficient
    movu     [r0], m1
    add      r0, 16
    add      r1, 32
    add      r2, 16
    dec      r3d
    jnz .loop
    RET

; AVX2 denoise_dct: same contract as the prototype
;   void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
; r0 = dct, r1 = sum, r2 = offset, r3d = size.
; Handles 16 coefficients per iteration; size must be a non-zero multiple of 16.
INIT_YMM avx2
cglobal denoise_dct, 4, 4, 6
    pxor     m5,  m5                ; m5 = 0, comparison reference below
    shr      r3d, 4                 ; r3d = iteration count (16 coefficients each)
.loop:
    movu     m0, [r0]
    pabsw    m1, m0                 ; m1 = |dct|, an unsigned 16-bit magnitude
    ; Widen with zero extension: pabsw(-32768) is 0x8000, which a signed
    ; extension would turn into -32768 and subtract from the uint32 sum.
    movu     m2, [r1]
    pmovzxwd m4, xm1
    paddd    m2, m4
    movu     [r1], m2
    vextracti128 xm4, m1, 1         ; upper eight magnitudes
    movu     m2, [r1 + 32]
    pmovzxwd m3, xm4
    paddd    m2, m3
    movu     [r1 + 32], m2
    movu     m3, [r2]
    psubusw  m1, m3                 ; max(|dct| - offset, 0)
    pcmpgtw  m4, m1, m5             ; signed compare: keep lanes that are > 0
    pand     m1, m4
    psignw   m1, m0                 ; re-apply the sign of the original coefficient
    movu     [r0], m1
    add      r0, 32
    add      r1, 64
    add      r2, 32
    dec      r3d
    jnz .loop
    RET
%if ARCH_X86_64 == 1
; AVX-512 denoise_dct: same contract as the prototype
;   void denoise_dct(int16_t* dct, uint32_t* sum, uint16_t* offset, int size)
; r0 = dct, r1 = sum, r2 = offset, r3d = size.
; size == 16 takes the single-shot .coeff16 path; any other size is handled
; 32 coefficients per iteration and must be a non-zero multiple of 32.
; NOTE(review): pcmpgtw and psignw are applied to ymm halves only; presumably
; this depends on how x86inc maps m16+ to physical registers and on those
; instructions lacking a 512-bit vector-destination form - confirm.
INIT_ZMM avx512
cglobal denoise_dct, 4, 4, 22
    pxor     m16,  m16              ; m16 = 0, comparison reference below
    cmp      r3d,   16
    je       .coeff16
    shr      r3d,    5              ; r3d = iteration count (32 coefficients each)

.loop:
    movu          m21, [r0]
    pabsw         m17, m21          ; m17 = |dct|, unsigned 16-bit magnitudes
    ; Widen with zero extension: pabsw(-32768) is 0x8000, which a signed
    ; extension would turn into -32768 and subtract from the uint32 sum.
    movu           m2, [r1]
    pmovzxwd       m4, ym17
    paddd          m2,  m4
    movu         [r1],  m2
    vextracti64x4 ym4, m17, 1       ; upper sixteen magnitudes

    movu           m2, [r1 + mmsize]
    pmovzxwd       m3, ym4
    paddd          m2, m3
    movu           [r1 + mmsize], m2
    movu           m3, [r2]
    psubusw       m17, m3           ; max(|dct| - offset, 0)

    ; build the "> 0" mask half by half, then merge back to 512 bits
    vextracti64x4 ym20,  m17,    1
    pcmpgtw       ym18, ym17, ym16
    pcmpgtw       ym19, ym20, ym16
    vinserti64x4   m18,  m18, ym19, 1

    pand           m17,  m18
    ; re-apply the original signs half by half, then merge
    vextracti64x4 ym19,  m17, 1
    vextracti64x4 ym20,  m21, 1
    psignw        ym17, ym21
    psignw        ym19, ym20
    vinserti64x4   m17,  m17, ym19, 1

    movu          [r0],  m17
    add             r0,  mmsize
    add             r1,  mmsize * 2
    add             r2,  mmsize
    dec             r3d
    jnz             .loop
    RET

.coeff16:
    ; exactly 16 coefficients: one 256-bit row of dct/offset, one 512-bit row of sum
    movu          ym19,  [r0]
    pabsw         ym17, ym19
    movu            m2, [r1]
    pmovzxwd       m18, ym17        ; zero-extend the magnitudes (see .loop)
    paddd           m2,  m18
    movu          [r1],   m2
    movu           ym3, [r2]
    psubusw       ym17, ym3
    pcmpgtw       ym18, ym17, ym16
    pand          ym17, ym18
    psignw        ym17, ym19
    movu          [r0], ym17
    RET
%endif ; ARCH_X86_64 == 1

%if ARCH_X86_64 == 1
; DCT8_PASS_1 coeff_off, out_off, regA, regB
;   %1 = byte offset from r6 of the coefficients; 8 bytes are loaded and
;        broadcast to every qword of m0
;   %2 = byte offset from r5 at which the 16-byte result is stored
;   %3, %4 = numbers of the two input registers
; Expects m5 = rounding constant.  Clobbers m0 and m2.
%macro DCT8_PASS_1 4
    vpbroadcastq    m0,                 [r6 + %1]
    pmaddwd         m2,                 m%3, m0
    pmaddwd         m0,                 m%4
    phaddd          m2,                 m0
    paddd           m2,                 m5
    psrad           m2,                 DCT8_SHIFT1
    packssdw        m2,                 m2
    ; gather qword 0 and qword 2 into the low 128 bits, which are then stored
    vpermq          m2,                 m2, 0x08
    mova            [r5 + %2],          xm2
%endmacro

; DCT8_PASS_2 coeff_off0, coeff_off1
;   %1, %2 = byte offsets from r6 of two 16-byte coefficient rows; each is
;            broadcast to both 128-bit lanes
; Inputs are m0-m3 (left unmodified) and m5 = rounding constant.
; The packed int16 result is returned in m10.  Clobbers m4 and m6-m10.
%macro DCT8_PASS_2 2
    vbroadcasti128  m4,                 [r6 + %1]
    pmaddwd         m6,                 m0, m4
    pmaddwd         m7,                 m1, m4
    pmaddwd         m8,                 m2, m4
    pmaddwd         m9,                 m3, m4
    phaddd          m6,                 m7
    phaddd          m8,                 m9
    phaddd          m6,                 m8
    paddd           m6,                 m5
    psrad           m6,                 DCT8_SHIFT2

    vbroadcasti128  m4,                 [r6 + %2]
    pmaddwd         m10,                m0, m4
    pmaddwd         m7,                 m1, m4
    pmaddwd         m8,                 m2, m4
    pmaddwd         m9,                 m3, m4
    phaddd          m10,                m7
    phaddd          m8,                 m9
    phaddd          m10,                m8
    paddd           m10,                m5
    psrad           m10,                DCT8_SHIFT2

    ; pack both results to int16, then swap the two middle qwords (0xD8)
    packssdw        m6,                 m10
    vpermq          m10,                m6, 0xD8

%endmacro

; AVX2 forward 8x8 DCT:
;   void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
;   r0 = src (rows read with aligned 16-byte loads), r1 = dst,
;   r2 = srcStride in int16 units (doubled to bytes below)
; Pass 1 writes eight 16-byte rows to the 8*16-byte stack buffer through
; DCT8_PASS_1; pass 2 reloads them as four 32-byte registers and stores four
; 32-byte results to dst through DCT8_PASS_2.
INIT_YMM avx2
cglobal dct8, 3, 7, 11, 0-8*16
    vbroadcasti128  m5,                [pd_ %+ DCT8_ROUND1]
; NOTE(review): DCT_SHIFT2 is not referenced inside this function (the pass-2
; macro uses DCT8_SHIFT2); kept because code outside this view may rely on it.
%define             DCT_SHIFT2         9

    ; double the full-width register: srcStride is an intptr_t, and the other
    ; dct8/idct8 kernels in this file also use the 64-bit form
    add             r2,                r2
    lea             r3,                [r2 * 3]
    lea             r4,                [r0 + r2 * 4]
    mov             r5,                rsp
    lea             r6,                [tab_dct8]
    mova            m6,                [dct8_shuf]

    ;pass1
    ; low lane = source rows 0-3, high lane = source rows 4-7
    mova            xm0,               [r0]
    vinserti128     m0,                m0, [r4], 1
    mova            xm1,               [r0 + r2]
    vinserti128     m1,                m1, [r4 + r2], 1
    mova            xm2,               [r0 + r2 * 2]
    vinserti128     m2,                m2, [r4 + r2 * 2], 1
    mova            xm3,               [r0 + r3]
    vinserti128     m3,                m3,  [r4 + r3], 1

    punpcklqdq      m4,                m0, m1
    punpckhqdq      m0,                m1
    punpcklqdq      m1,                m2, m3
    punpckhqdq      m2,                m3

    ; reorder the right-hand halves with dct8_shuf before the butterfly
    pshufb          m0,                m6
    pshufb          m2,                m6

    ; sums go to m3/m7, differences to m4/m1
    paddw           m3,                m4, m0
    paddw           m7,                m1, m2

    psubw           m4,                m0
    psubw           m1,                m2

    ; even-numbered table rows take the sums, odd-numbered rows the differences
    DCT8_PASS_1     0 * 16,             0 * 16, 3, 7
    DCT8_PASS_1     1 * 16,             2 * 16, 4, 1
    DCT8_PASS_1     2 * 16,             4 * 16, 3, 7
    DCT8_PASS_1     3 * 16,             6 * 16, 4, 1
    DCT8_PASS_1     4 * 16,             1 * 16, 3, 7
    DCT8_PASS_1     5 * 16,             3 * 16, 4, 1
    DCT8_PASS_1     6 * 16,             5 * 16, 3, 7
    DCT8_PASS_1     7 * 16,             7 * 16, 4, 1

    ;pass2
    vbroadcasti128  m5,                [pd_ %+ DCT8_ROUND2]

    mova            m0,                [r5]
    mova            m1,                [r5 + 32]
    mova            m2,                [r5 + 64]
    mova            m3,                [r5 + 96]

    DCT8_PASS_2     0 * 16, 1 * 16
    movu            [r1],              m10
    DCT8_PASS_2     2 * 16, 3 * 16
    movu            [r1 + 32],         m10
    DCT8_PASS_2     4 * 16, 5 * 16
    movu            [r1 + 64],         m10
    DCT8_PASS_2     6 * 16, 7 * 16
    movu            [r1 + 96],         m10
    RET


; DCT8_AVX512_PASS_1  coefA, outA, coefB, tmpB
;   %1  register number of the coefficients multiplied with m3
;   %2  register number that receives the packed, permuted 16-bit results
;   %3  register number of the coefficients multiplied with m2
;   %4  register number used for the second partial result (clobbered)
; Reads m1 (vpermw control), m2, m3 and m5 (rounding constant).
; Clobbers m8 in addition to m%2 and m%4.
%macro DCT8_AVX512_PASS_1 4
    vpmaddwd        m%2,               m3, m%1
    vpsrlq          m8,                m%2, 32             ; bring the odd dword of every qword down
    vpaddd          m%2,               m8                  ; even dword = sum of the dword pair
    vpaddd          m%2,               m5                  ; round
    vpsrad          m%2,               DCT8_SHIFT1

    ; same reduction for the m2 operand
    vpmaddwd        m%4,               m2, m%3
    vpsrlq          m8,                m%4, 32
    vpaddd          m%4,               m8
    vpaddd          m%4,               m5
    vpsrad          m%4,               DCT8_SHIFT1

    vpackssdw       m%2,               m%4                 ; saturate both dword results to words
    vpermw          m%2,               m1, m%2             ; reorder words using the control in m1
%endmacro

; DCT8_AVX512_PASS_2_ROW  coef, acc0, acc1, tmp0, tmp1
;   Private helper of DCT8_AVX512_PASS_2: one coefficient row of the second pass.
;   %1  register number holding the coefficient vector
;   %2  register number receiving the result (product with m9, then packed output)
;   %3  register number for the product with m10 (clobbered)
;   %4, %5  scratch register numbers (clobbered)
;   Reads m5 (rounding constant), m9, m10 (pass-1 data) and m19 (vpermw control).
%macro DCT8_AVX512_PASS_2_ROW 5
    vpmaddwd         m%2,              m9,  m%1
    vpmaddwd         m%3,              m10, m%1
    ; fold the four dwords of every 128-bit lane into dword 0 of that lane
    vpsrldq          m%4,              m%2, 8
    vpsrldq          m%5,              m%3, 8
    vpaddd           m%2,              m%4
    vpaddd           m%3,              m%5
    vpsrlq           m%4,              m%2, 32
    vpsrlq           m%5,              m%3, 32
    vpaddd           m%2,              m%4
    vpaddd           m%3,              m%5
    ; round, scale and narrow to 16 bit
    vpaddd           m%2,              m5
    vpsrad           m%2,              DCT8_SHIFT2
    vpaddd           m%3,              m5
    vpsrad           m%3,              DCT8_SHIFT2
    vpackssdw        m%2,              m%3
    vpermw           m%2,              m19, m%2
%endmacro

; DCT8_AVX512_PASS_2  coef0, coef1, coef2, coef3
;   Runs four coefficient rows (%1..%4 are register numbers of the coefficient
;   vectors) against the pass-1 data in m9/m10 and assembles the four 128-bit
;   row results into m0 in the order %1, %2, %3, %4 (lowest lane first).
;   Reads m5, m9, m10, m19.  Clobbers m0-m4 and m6.
;   The expansion is instruction-for-instruction the same as the previous
;   hand-unrolled version; only the source text was factored.
%macro DCT8_AVX512_PASS_2 4
    DCT8_AVX512_PASS_2_ROW %1, 0, 1, 2, 3
    DCT8_AVX512_PASS_2_ROW %2, 1, 2, 3, 4
    vinserti128      ym0,              ym0, xm1, 1         ; ym0 = [row %1 | row %2]

    DCT8_AVX512_PASS_2_ROW %3, 1, 2, 3, 4
    DCT8_AVX512_PASS_2_ROW %4, 2, 3, 4, 6

    vinserti128      ym1,              ym1, xm2, 1         ; ym1 = [row %3 | row %4]
    vinserti64x4     m0,               m0, ym1, 1          ; m0  = all four rows
%endmacro

;------------------------------------------------------------------------------
; dct8, AVX-512 version
;   r0  source: eight rows, each read with an aligned 16-byte load
;   r1  destination: written with two unaligned 64-byte stores (128 bytes)
;   r2  source stride; doubled on entry
;       NOTE(review): presumably the stride arrives in 16-bit elements and the
;       doubling converts it to bytes - confirm against the C prototype.
;   Uses r3-r6 as scratch pointers and m0-m23.
;------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal dct8, 3, 7, 24

    vbroadcasti32x4  m5,               [pd_ %+ DCT8_ROUND1]    ; pass-1 rounding constant
    vbroadcasti32x8  m4,               [dct8_shuf]
    vbroadcasti32x4  m19,              [dct8_shuf9_AVX512]     ; vpermw control used by DCT8_AVX512_PASS_2

    add              r2d,              r2d
    lea              r3,               [r2 * 3]
    lea              r4,               [r0 + r2 * 4]           ; r4 -> row 4
    lea              r5,               [tab_dct8]              ; table used in pass 2
    lea              r6,               [tab_dct8_avx512]       ; table used in pass 1

    ;pass1
    ; rows 0-3 go to the low 128 bits, rows 4-7 to the next 128 bits of ym0-ym3
    mova            xm0,               [r0]
    vinserti128     ym0,               ym0, [r4], 1
    mova            xm1,               [r0 + r2]
    vinserti128     ym1,               ym1, [r4 + r2], 1
    mova            xm2,               [r0 + r2 * 2]
    vinserti128     ym2,               ym2, [r4 + r2 * 2], 1
    mova            xm3,               [r0 + r3]
    vinserti128     ym3,               ym3,  [r4 + r3], 1

    ; m0 = [row0 row4 row2 row6], m1 = [row1 row5 row3 row7]
    vinserti64x4    m0,                m0, ym2, 1
    vinserti64x4    m1,                m1, ym3, 1

    vpunpcklqdq     m2,                m0, m1                  ; low halves of every row
    vpunpckhqdq     m0,                m1                      ; high halves of every row

    vpshufb         m0,                m4
    vpaddw          m3,                m2, m0                  ; sums, consumed as m3 by DCT8_AVX512_PASS_1
    vpsubw          m2,                m0                      ; differences, consumed as m2 by DCT8_AVX512_PASS_1

    vbroadcasti32x8 m1,                [dct8_shuf7_AVX512]     ; vpermw control used by DCT8_AVX512_PASS_1

    ; Load all the coefficients together for better caching
    vpbroadcastq    m20,               [r6 + 0 * 8]
    vpbroadcastq    m21,               [r6 + 1 * 8]
    vpbroadcastq    m22,               [r6 + 2 * 8]
    vpbroadcastq    m23,               [r6 + 3 * 8]
    vpbroadcastq    m7,                [r6 + 4 * 8]
    vpbroadcastq    m12,               [r6 + 5 * 8]
    vpbroadcastq    m14,               [r6 + 6 * 8]
    vpbroadcastq    m16,               [r6 + 7 * 8]

    ; results land in m9, m11, m13, m15; m10 is only scratch here
    DCT8_AVX512_PASS_1     20,       9, 21,      10
    DCT8_AVX512_PASS_1     22,      11, 23,      10
    DCT8_AVX512_PASS_1     7,       13, 12,      10
    DCT8_AVX512_PASS_1     14,      15, 16,      10

    ;pass2
    vbroadcasti32x4        m5,          [pd_ %+ DCT8_ROUND2]   ; pass-2 rounding constant

    ; gather the four pass-1 results into the two registers read by DCT8_AVX512_PASS_2
    vinserti64x4           m9,          m9,  ym11, 1
    vinserti64x4           m10,         m13, ym15, 1

    ; Load all the coefficients together for better caching and reuse common coefficients from PASS 1
    ; (m20 and m7 are not reloaded; they keep their pass-1 contents)
    vbroadcasti32x4    m21,                [r5 + 1 * 16]
    vbroadcasti32x4    m22,                [r5 + 2 * 16]
    vbroadcasti32x4    m23,                [r5 + 3 * 16]
    vbroadcasti32x4    m12,                [r5 + 5 * 16]
    vbroadcasti32x4    m14,                [r5 + 6 * 16]
    vbroadcasti32x4    m16,                [r5 + 7 * 16]

    DCT8_AVX512_PASS_2     20, 21, 22, 23
    movu                   [r1],        m0                     ; first 64 bytes of output
    DCT8_AVX512_PASS_2     7, 12, 14, 16
    movu                   [r1 + 64],   m0                     ; second 64 bytes of output
    RET

; DCT16_PASS_1_E  coefOffset, dstOffset
;   %1  byte offset from r7 of a 64-bit coefficient group (broadcast to every qword)
;   %2  byte offset from r5 where the 16 result bytes are stored (aligned store)
; Reads m0, m2 and m9 (rounding constant); clobbers m4, m6 and m7.
%macro DCT16_PASS_1_E 2
    vpbroadcastq    m7,                [r7 + %1]

    pmaddwd         m4,                m0, m7
    pmaddwd         m6,                m2, m7
    phaddd          m4,                m6

    paddd           m4,                m9
    psrad           m4,                DCT_SHIFT

    packssdw        m4,                m4
    vpermq          m4,                m4, 0x08            ; gather qwords 0 and 2 into the low 128 bits

    mova            [r5 + %2],         xm4
%endmacro

; DCT16_PASS_1_O  coefOffset, dstOffset
;   %1  byte offset from r7 of a 128-bit coefficient row (broadcast to both lanes)
;   %2  byte offset from r5 where the 16 result bytes are stored (aligned store)
; Reads m0, m2, m4, m6 and m9 (rounding constant); clobbers m7, m10, m11 and m12.
%macro DCT16_PASS_1_O 2
    vbroadcasti128  m7,                [r7 + %1]

    pmaddwd         m10,               m0, m7
    pmaddwd         m11,               m2, m7
    phaddd          m10,               m11                 ; [d0 d0 d1 d1 d4 d4 d5 d5]

    pmaddwd         m11,               m4, m7
    pmaddwd         m12,               m6, m7
    phaddd          m11,               m12                 ; [d2 d2 d3 d3 d6 d6 d7 d7]

    phaddd          m10,               m11                 ; [d0 d1 d2 d3 d4 d5 d6 d7]

    paddd           m10,               m9
    psrad           m10,               DCT_SHIFT

    packssdw        m10,               m10                 ; [w0 w1 w2 w3 - - - - w4 w5 w6 w7 - - - -]
    vpermq          m10,               m10, 0x08           ; gather qwords 0 and 2 -> [w0 .. w7] in the low 128 bits

    mova            [r5 + %2],         xm10
%endmacro

; DCT16_PASS_2_ROW  coefOffset, acc
;   Private helper of DCT16_PASS_2: one coefficient row of the second pass.
;   %1  byte offset applied to both r7 and r8 to fetch the two 128-bit
;       coefficient halves (each broadcast to both lanes)
;   %2  register number receiving the eight rounded, shifted 32-bit results
;   Reads m0-m7 (pass-1 data) and m9 (rounding constant).
;   Clobbers m8, m11, m12 and m13.
%macro DCT16_PASS_2_ROW 2
    vbroadcasti128  m8,                [r7 + %1]
    vbroadcasti128  m13,               [r8 + %1]

    pmaddwd         m%2,               m0, m8
    pmaddwd         m11,               m1, m13
    paddd           m%2,               m11

    pmaddwd         m11,               m2, m8
    pmaddwd         m12,               m3, m13
    paddd           m11,               m12
    phaddd          m%2,               m11

    pmaddwd         m11,               m4, m8
    pmaddwd         m12,               m5, m13
    paddd           m11,               m12

    pmaddwd         m12,               m6, m8
    pmaddwd         m13,               m7, m13             ; last use of m13, so it doubles as the destination
    paddd           m12,               m13
    phaddd          m11,               m12

    phaddd          m%2,               m11
    paddd           m%2,               m9
    psrad           m%2,               DCT_SHIFT2
%endmacro

; DCT16_PASS_2  coefOffsetA, coefOffsetB
;   Computes two coefficient rows of the second pass.
;   %1, %2  byte offsets (relative to r7 and r8) of the two coefficient rows
;   On exit xm15 holds the eight 16-bit results for row %1 and xm14 those
;   for row %2.  Reads m0-m7 and m9; clobbers m8 and m10-m15.
;   The expansion is instruction-for-instruction the same as the previous
;   hand-unrolled version; only the source text was factored.
%macro DCT16_PASS_2 2
    DCT16_PASS_2_ROW %1, 10

    DCT16_PASS_2_ROW %2, 14

    packssdw        m10,               m14                 ; per lane: [row %1 x4 | row %2 x4]
    vextracti128    xm14,              m10,       1
    movlhps         xm15,              xm10,      xm14     ; low qwords  -> row %1
    movhlps         xm14,              xm10                ; high qwords -> row %2
%endmacro
;------------------------------------------------------------------------------
; dct16, AVX2 version
;   r0  source: sixteen rows, each read with one unaligned 32-byte load
;   r1  destination: sixteen rows written at a fixed pitch of 32 bytes
;   r2  source stride; doubled on entry
;       NOTE(review): presumably the stride arrives in 16-bit elements and the
;       doubling converts it to bytes - confirm against the C prototype.
;   A 16*mmsize byte stack area (addressed through r5 = rsp) carries the
;   pass-1 results to pass 2.
;------------------------------------------------------------------------------
INIT_YMM avx2
cglobal dct16, 3, 9, 16, 0-16*mmsize
; m9 = pass-1 rounding constant, chosen together with DCT_SHIFT per bit depth
%if BIT_DEPTH == 12
    %define         DCT_SHIFT          7
    vbroadcasti128  m9,                [pd_64]
%elif BIT_DEPTH == 10
    %define         DCT_SHIFT          5
    vbroadcasti128  m9,                [pd_16]
%elif BIT_DEPTH == 8
    %define         DCT_SHIFT          3
    vbroadcasti128  m9,                [pd_4]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             DCT_SHIFT2         10

    add             r2d,               r2d

    mova            m13,               [dct16_shuf1]
    mova            m14,               [dct16_shuf2]
    ; table bases are biased by 8 rows so the macro offsets below run from -8*16 to 7*16
    lea             r7,                [tab_dct16_1 + 8 * 16]
    lea             r8,                [tab_dct16_2 + 8 * 16]
    lea             r3,                [r2 * 3]
    mov             r5,                rsp                 ; r5 -> intermediate buffer on the stack
    mov             r4d,               2                   ; Each iteration process 8 rows, so 16/8 iterations

.pass1:
    lea             r6,                [r0 + r2 * 4]       ; r6 -> row 4 of the current group of 8 rows

    movu            m2,                [r0]
    movu            m1,                [r6]
    vperm2i128      m0,                m2, m1, 0x20        ; [row0lo  row4lo]
    vperm2i128      m1,                m2, m1, 0x31        ; [row0hi  row4hi]

    movu            m4,                [r0 + r2]
    movu            m3,                [r6 + r2]
    vperm2i128      m2,                m4, m3, 0x20        ; [row1lo  row5lo]
    vperm2i128      m3,                m4, m3, 0x31        ; [row1hi  row5hi]

    movu            m6,                [r0 + r2 * 2]
    movu            m5,                [r6 + r2 * 2]
    vperm2i128      m4,                m6, m5, 0x20        ; [row2lo  row6lo]
    vperm2i128      m5,                m6, m5, 0x31        ; [row2hi  row6hi]

    movu            m8,                [r0 + r3]
    movu            m7,                [r6 + r3]
    vperm2i128      m6,                m8, m7, 0x20        ; [row3lo  row7lo]
    vperm2i128      m7,                m8, m7, 0x31        ; [row3hi  row7hi]

    ; shuffle the high halves with dct16_shuf1 before forming sums and differences
    pshufb          m1,                m13
    pshufb          m3,                m13
    pshufb          m5,                m13
    pshufb          m7,                m13

    paddw           m8,                m0, m1              ;E
    psubw           m0,                m1                  ;O

    paddw           m1,                m2, m3              ;E
    psubw           m2,                m3                  ;O

    paddw           m3,                m4, m5              ;E
    psubw           m4,                m5                  ;O

    paddw           m5,                m6, m7              ;E
    psubw           m6,                m7                  ;O

    ; O terms live in m0, m2, m4, m6 (the inputs of DCT16_PASS_1_O)
    DCT16_PASS_1_O  -7 * 16,           1 * 32
    DCT16_PASS_1_O  -5 * 16,           3 * 32
    DCT16_PASS_1_O  -3 * 16,           1 * 32 + 16
    DCT16_PASS_1_O  -1 * 16,           3 * 32 + 16
    DCT16_PASS_1_O  1 * 16,            5 * 32
    DCT16_PASS_1_O  3 * 16,            7 * 32
    DCT16_PASS_1_O  5 * 16,            5 * 32 + 16
    DCT16_PASS_1_O  7 * 16,            7 * 32 + 16

    ; E terms (m8, m1, m3, m5): shuffle with dct16_shuf2, then pairwise add
    pshufb          m8,                m14
    pshufb          m1,                m14
    phaddw          m0,                m8, m1

    pshufb          m3,                m14
    pshufb          m5,                m14
    phaddw          m2,                m3, m5

    ; DCT16_PASS_1_E reads m0 and m2
    DCT16_PASS_1_E  -8 * 16,           0 * 32
    DCT16_PASS_1_E  -4 * 16,           0 * 32 + 16
    DCT16_PASS_1_E  0 * 16,            4 * 32
    DCT16_PASS_1_E  4 * 16,            4 * 32 + 16

    ; same shuffled E terms, pairwise subtract
    phsubw          m0,                m8, m1
    phsubw          m2,                m3, m5

    DCT16_PASS_1_E  -6 * 16,           2 * 32
    DCT16_PASS_1_E  -2 * 16,           2 * 32 + 16
    DCT16_PASS_1_E  2 * 16,            6 * 32
    DCT16_PASS_1_E  6 * 16,            6 * 32 + 16

    lea             r0,                [r0 + 8 * r2]       ; next 8 source rows
    add             r5,                256                 ; next 8 * 32 bytes of the intermediate buffer

    dec             r4d
    jnz             .pass1

    mov             r5,                rsp                 ; rewind to the start of the intermediate buffer
    mov             r4d,               2                   ; two iterations, each producing 8 output columns
    mov             r2d,               32                  ; r2 now = output row pitch in bytes
    lea             r3,                [r2 * 3]
    vbroadcasti128  m9,                [pd_512]            ; pass-2 rounding constant

.pass2:
    mova            m0,                [r5 + 0 * 32]        ; [row0lo  row4lo]
    mova            m1,                [r5 + 8 * 32]        ; [row0hi  row4hi]

    mova            m2,                [r5 + 1 * 32]        ; [row1lo  row5lo]
    mova            m3,                [r5 + 9 * 32]        ; [row1hi  row5hi]

    mova            m4,                [r5 + 2 * 32]        ; [row2lo  row6lo]
    mova            m5,                [r5 + 10 * 32]       ; [row2hi  row6hi]

    mova            m6,                [r5 + 3 * 32]        ; [row3lo  row7lo]
    mova            m7,                [r5 + 11 * 32]       ; [row3hi  row7hi]

    ; each DCT16_PASS_2 leaves one output row in xm15 and the next one in xm14
    DCT16_PASS_2    -8 * 16, -7 * 16
    movu            [r1],              xm15
    movu            [r1 + r2],         xm14

    DCT16_PASS_2    -6 * 16, -5 * 16
    movu            [r1 + r2 * 2],     xm15
    movu            [r1 + r3],         xm14

    lea             r6,                [r1 + r2 * 4]       ; r6 -> output row 4
    DCT16_PASS_2    -4 * 16, -3 * 16
    movu            [r6],              xm15
    movu            [r6 + r2],         xm14

    DCT16_PASS_2    -2 * 16, -1 * 16
    movu            [r6 + r2 * 2],     xm15
    movu            [r6 + r3],         xm14

    lea             r6,                [r6 + r2 * 4]       ; r6 -> output row 8
    DCT16_PASS_2    0 * 16, 1 * 16
    movu            [r6],              xm15
    movu            [r6 + r2],         xm14

    DCT16_PASS_2    2 * 16, 3 * 16
    movu            [r6 + r2 * 2],     xm15
    movu            [r6 + r3],         xm14

    lea             r6,                [r6 + r2 * 4]       ; r6 -> output row 12
    DCT16_PASS_2    4 * 16, 5 * 16
    movu            [r6],              xm15
    movu            [r6 + r2],         xm14

    DCT16_PASS_2    6 * 16, 7 * 16
    movu            [r6 + r2 * 2],     xm15
    movu            [r6 + r3],         xm14

    add             r1,                16                  ; second half (next 8 words) of every output row
    add             r5,                128                 ; next four 32-byte slots of the intermediate buffer

    dec             r4d
    jnz             .pass2
    RET
; DCT16_avx512_PASS_1_O_ROW  coefOffset, dst
;   Private helper of DCT16_avx512_PASS_1_O: one coefficient row.
;   %1  byte offset from r5 of the 128-bit coefficient row (broadcast to all lanes)
;   %2  register number receiving the rounded, shifted 32-bit results
;   Reads m0 (rounding constant), m2, m6, m8, m10 (the psubw outputs of
;   DCT16_AVX512_PASS_1_LOOP) and m26 (vpermi2d index vector).
;   Clobbers m1, m3, m11, m12 and m13.
%macro DCT16_avx512_PASS_1_O_ROW 2
    vbroadcasti32x4  m1,               [r5 + %1]

    pmaddwd          m3,               m6,  m1
    vpsrldq          m11,              m3,  8
    vpaddd           m3,               m11

    pmaddwd          m11,              m8,  m1
    vpsrldq          m12,              m11, 8
    vpaddd           m11,              m12

    vpunpcklqdq      m12,              m3, m11
    vpsrldq          m11,              m12, 4
    vpaddd           m11,              m12

    pmaddwd          m3,               m10, m1
    vpsrldq          m12,              m3,  8
    vpaddd           m3,               m12

    pmaddwd          m12,              m2,  m1
    vpsrldq          m13,              m12, 8
    vpaddd           m12,              m13

    vpunpcklqdq      m13,              m3, m12
    vpsrldq          m12,              m13, 4
    vpaddd           m12,              m13

    mova             m%2,              m26                 ; vpermi2d overwrites its index operand, so copy it
    vpermi2d         m%2,              m11, m12
    paddd            m%2,              m0
    psrad            m%2,              DCT_SHIFT
%endmacro

; DCT16_avx512_PASS_1_O  coefOffsetA, coefOffsetB, tmp, dst
;   Computes two coefficient rows from the psubw outputs of the load stage.
;   %1, %2  byte offsets from r5 of the two coefficient rows
;   %3      register number for the first row's 32-bit results (clobbered)
;   %4      register number for the second row's 32-bit results; on exit
;           it holds both rows packed to words and permuted through m25
;   Reads m0, m2, m6, m8, m10, m25, m26; clobbers m1, m3, m11-m13.
;   The expansion is instruction-for-instruction the same as the previous
;   hand-unrolled version; only the source text was factored.
%macro DCT16_avx512_PASS_1_O 4
    DCT16_avx512_PASS_1_O_ROW %1, %3

    ; next row
    DCT16_avx512_PASS_1_O_ROW %2, %4

    packssdw         m%3,              m%4
    vpermw           m%4,              m25, m%3
%endmacro

; DCT16_AVX512_PASS_1_LOOP (no parameters)
;   Loads sixteen 32-byte source rows and forms word sums and differences.
;   On entry: r0 -> row 0, r2 = row pitch in bytes, r3 = 3 * r2, r4 = r0 + 4 * r2.
;   On exit:
;     m4, m5, m7, m9   paddw results for row groups 0-3, 4-7, 8-11, 12-15
;     m6, m8, m10, m2  psubw results for the same row groups
;     r0 is advanced by 8 rows and r4 = r0 + 4 rows
;     m1 = broadcast dct16_shuf1; m3, m11 and m12 are clobbered
%macro DCT16_AVX512_PASS_1_LOOP 0
    vbroadcasti32x8 m1,                [dct16_shuf1]
    mova            m2,                [dct16_shuf3_AVX512]    ; vpermi2q index vector for the first operand
    mova            m3,                [dct16_shuf4_AVX512]    ; vpermi2q index vector for the second operand

    ; rows 0-3
    movu            ym4,               [r0]
    movu            ym5,               [r0 + r2]
    vinserti64x4    m4,                m4, ym5, 1

    movu            ym5,               [r0 + 2 * r2]
    movu            ym6,               [r0 + r3]
    vinserti64x4    m5,                m5, ym6, 1

    ; vpermi2q overwrites its index operand, so work on copies of m2/m3
    mova            m6,                m2
    mova            m7,                m3
    vpermi2q        m6,                m4, m5
    vpermi2q        m7,                m4, m5

    ; rows 4-7
    movu            ym4,               [r4]
    movu            ym5,               [r4 + r2]
    vinserti64x4    m4,                m4, ym5, 1

    movu            ym5,               [r4 + 2 * r2]
    movu            ym8,               [r4 + r3]
    vinserti64x4    m5,                m5, ym8, 1

    mova            m8,                m2
    mova            m9,                m3
    vpermi2q        m8,                m4, m5
    vpermi2q        m9,                m4, m5

    vpshufb         m7,                m1
    vpshufb         m9,                m1

    paddw           m4,                m6, m7
    psubw           m6,                m7

    paddw           m5,                m8, m9
    psubw           m8,                m9

    ; advance to rows 8-15
    lea             r0,                [r0 + 8 * r2]
    lea             r4,                [r0 + r2 * 4]

    ; rows 8-11
    movu            ym7,               [r0]
    movu            ym9,               [r0 + r2]
    vinserti64x4    m7,                m7, ym9, 1

    movu            ym9,               [r0 + 2 * r2]
    movu            ym10,              [r0 + r3]
    vinserti64x4    m9,                m9, ym10, 1

    mova            m10,               m2
    mova            m11,               m3
    vpermi2q        m10,               m7, m9
    vpermi2q        m11,               m7, m9

    vpshufb         m11,               m1
    paddw           m7,                m10, m11
    psubw           m10,               m11

    ; rows 12-15
    movu            ym9,               [r4]
    movu            ym11,              [r4 + r2]
    vinserti64x4    m9,                m9, ym11, 1

    movu            ym11,              [r4 + 2 * r2]
    movu            ym12,              [r4 + r3]
    vinserti64x4    m11,               m11, ym12, 1

    ; last use of the index vectors, so permute in place
    vpermi2q        m2,                m9, m11
    vpermi2q        m3,                m9, m11

    vpshufb         m3,                m1
    paddw           m9,                m2, m3
    psubw           m2,                m3
%endmacro

; DCT16_avx512_PASS_1_E_ROW  coefOffset, dst
;   Private helper of DCT16_avx512_PASS_1_E: one coefficient row.
;   %1  byte offset from r5 of a 64-bit coefficient group (broadcast to every qword)
;   %2  register number receiving the rounded, shifted 32-bit results
;   Reads m0 (rounding constant), m11, m13 (input words) and m27
;   (vpermi2d index vector).  Clobbers m1, m12, m18 and m19.
%macro DCT16_avx512_PASS_1_E_ROW 2
    vpbroadcastq      m1,              [r5 + %1]

    pmaddwd          m19,              m11,  m1
    vpsrldq          m12,              m19,  4
    vpaddd           m12,              m19

    pmaddwd          m19,              m13,  m1
    vpsrldq          m18,              m19,  4
    vpaddd           m18,              m19

    mova             m%2,              m27                 ; vpermi2d overwrites its index operand, so copy it
    vpermi2d         m%2,              m12, m18
    paddd            m%2,              m0
    psrad            m%2,              DCT_SHIFT
%endmacro

; DCT16_avx512_PASS_1_E  coefOffsetA, tmp, coefOffsetB, dst
;   Computes two coefficient rows from the words in m11/m13.
;   %1  byte offset from r5 of the first coefficient group
;   %2  register number for the first row's 32-bit results (clobbered)
;   %3  byte offset from r5 of the second coefficient group
;   %4  register number for the second row's 32-bit results; on exit it
;       holds both rows packed to words and permuted through m25
;   Reads m0, m11, m13, m25, m27; clobbers m1, m12, m18, m19.
;   The expansion is instruction-for-instruction the same as the previous
;   hand-unrolled version; only the source text was factored.
%macro DCT16_avx512_PASS_1_E 4
    DCT16_avx512_PASS_1_E_ROW %1, %2

    ; 2nd row
    DCT16_avx512_PASS_1_E_ROW %3, %4

    packssdw         m%2,              m%4
    vpermw           m%4,              m25, m%2
%endmacro

; DCT16_PASS2_AVX512_DOT  dst, data, coef, tmp
;   Private helper of DCT16_PASS2_AVX512.
;   Multiplies the words of m%2 with those of m%3 (vpmaddwd) and folds the
;   four dword products of every 128-bit lane into dword 0 of that lane.
;   %1  register number receiving the result
;   %2  register number of the pass-1 data
;   %3  register number of the coefficient vector
;   %4  scratch register number (clobbered)
%macro DCT16_PASS2_AVX512_DOT 4
    vpmaddwd         m%1,  m%2, m%3
    vpsrldq          m%4,  m%1, 8
    vpaddd           m%1,  m%4
    vpsrldq          m%4,  m%1, 4
    vpaddd           m%1,  m%4
%endmacro

; DCT16_PASS2_AVX512  coef, d0, d1, d2, d3, d4, d5, d6, d7, dst
;   %1       register number of the coefficient vector
;   %2..%9   register numbers of the eight pass-1 data registers
;   %10      register number receiving the packed, permuted 16-bit results
;   Reads m0 (rounding constant), m2 (vpermd control) and m3 (vpermw control).
;   Clobbers m4-m8.
;   The expansion is instruction-for-instruction the same as the previous
;   hand-unrolled version; only the source text was factored.
%macro DCT16_PASS2_AVX512 10
    ; first four data registers -> m5
    DCT16_PASS2_AVX512_DOT 5, %2, %1, 6
    DCT16_PASS2_AVX512_DOT 6, %3, %1, 7
    vpunpckldq       m7,   m5, m6

    DCT16_PASS2_AVX512_DOT 5, %4, %1, 6
    DCT16_PASS2_AVX512_DOT 6, %5, %1, 8
    vpunpckldq       m8,   m5, m6

    vpunpcklqdq      m5,   m7, m8
    vpermd           m5,   m2, m5
    vpsrldq          m6,   m5,  4
    vpaddd           m5,   m6

    ; last four data registers -> m6
    DCT16_PASS2_AVX512_DOT 6, %6, %1, 7
    DCT16_PASS2_AVX512_DOT 7, %7, %1, 8
    vpunpckldq       m8,   m6, m7

    DCT16_PASS2_AVX512_DOT 6, %8, %1, 7
    DCT16_PASS2_AVX512_DOT 7, %9, %1, 4
    vpunpckldq       m4,   m6, m7

    vpunpcklqdq      m6,   m8, m4
    vpermd           m6,   m2, m6
    vpsrldq          m7,   m6,  4
    vpaddd           m6,   m7

    ; round, scale, narrow to words and reorder
    paddd            m5, m0
    psrad            m5, DCT_SHIFT2
    paddd            m6, m0
    psrad            m6, DCT_SHIFT2

    packssdw         m5, m6
    vpermw           m%10, m3, m5
%endmacro

;------------------------------------------------------------------------------
; dct16, AVX-512 version
;   r0  source: sixteen rows, each read with one unaligned 32-byte load
;   r1  destination: written with eight unaligned 64-byte stores (512 bytes)
;   r2  source stride; doubled on entry
;       NOTE(review): presumably the stride arrives in 16-bit elements and the
;       doubling converts it to bytes - confirm against the C prototype.
;   The intermediate results stay in registers (m14-m17 and m20-m23);
;   no stack buffer is used.
;------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal dct16, 3, 6, 29

; m0 = pass-1 rounding constant, chosen together with DCT_SHIFT per bit depth
%if BIT_DEPTH == 12
    %define          DCT_SHIFT          7
    vbroadcasti32x4  m0,                [pd_64]
%elif BIT_DEPTH == 10
    %define          DCT_SHIFT          5
    vbroadcasti32x4  m0,                [pd_16]
%elif BIT_DEPTH == 8
    %define          DCT_SHIFT          3
    vbroadcasti32x4  m0,                [pd_4]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             DCT_SHIFT2         10

    add             r2d,               r2d
    lea             r3,                [r2 * 3]
    lea             r4,                [r0 + r2 * 4]
    ; table base is biased by 8 rows so the macro offsets below run from -8*16 to 7*16
    lea             r5,                [tab_dct16_1 + 8 * 16]

    ;Load reusable tables once to save memory movements
    mova             m25,              [dct16_shuf5_AVX512]
    mova             m26,              [dct16_shuf2_AVX512]
    mova             m27,              [dct16_shuf7_AVX512]
    vbroadcasti32x8  m28,              [dct16_shuf6_AVX512]

    ; sums end up in m4, m5, m7, m9 and differences in m6, m8, m10, m2
    DCT16_AVX512_PASS_1_LOOP
    ; the last macro argument names the register that holds the packed result
    DCT16_avx512_PASS_1_O              -7 * 16, -5 * 16, 15, 14    ;row 1,   3
    DCT16_avx512_PASS_1_O              -3 * 16, -1 * 16, 16, 15    ;row 5,   7
    DCT16_avx512_PASS_1_O               1 * 16,  3 * 16, 17, 16    ;row 9,  11
    DCT16_avx512_PASS_1_O               5 * 16,  7 * 16, 18, 17    ;row 13, 15

    vbroadcasti32x8 m1,                [dct16_shuf2]
    pshufb          m4,                m1
    pshufb          m5,                m1
    pshufb          m7,                m1
    pshufb          m9,                m1

    ; m3/m6/m8/m10 = sums shifted down by one word; kept for the vpaddw step further down
    vpsrldq          m3,              m4,  2
    vpsubw           m11,             m4,  m3
    vpsrldq          m6,              m5,  2
    vpsubw           m12,             m5,  m6
    vpsrldq          m8,              m7,  2
    vpsubw           m13,             m7,  m8
    vpsrldq          m10,             m9,  2
    vpsubw           m18,             m9,  m10

    ; gather the word differences into m11 and m13, the inputs of DCT16_avx512_PASS_1_E
    vpermw           m11,             m28, m11
    vpermw           m12,             m28, m12
    vinserti64x4     m11,             m11, ym12, 1

    vpermw           m13,             m28, m13
    vpermw           m18,             m28, m18
    vinserti64x4     m13,             m13, ym18, 1

    DCT16_avx512_PASS_1_E            -6 * 16, 21, -2 * 16, 20    ; row  2,  6
    DCT16_avx512_PASS_1_E             2 * 16, 22,  6 * 16, 21    ; row 10, 14

    ; same gathering for the word sums
    vpaddw           m11,             m4,  m3
    vpaddw           m12,             m5,  m6
    vpaddw           m13,             m7,  m8
    vpaddw           m18,             m9,  m10

    vpermw           m11,             m28, m11
    vpermw           m12,             m28, m12
    vinserti64x4     m11,             m11, ym12, 1

    vpermw           m13,             m28, m13
    vpermw           m18,             m28, m18
    vinserti64x4     m13,             m13, ym18, 1

    DCT16_avx512_PASS_1_E            -8 * 16, 23, 0 * 16, 22    ; row 0, 8
    DCT16_avx512_PASS_1_E            -4 * 16, 24, 4 * 16, 23    ; row 4, 12

    ;PASS2
    vbroadcasti128    m0,             [pd_512]             ; pass-2 rounding constant

    lea              r5,              [tab_dct16]
    mova             m2,              [dct16_shuf9_AVX512] ; vpermd control used by DCT16_PASS2_AVX512
    vbroadcasti32x8  m3,              [dct16_shuf8_AVX512] ; vpermw control used by DCT16_PASS2_AVX512

    ; Each pair of macro calls consumes two 32-byte rows of tab_dct16 and
    ; produces 64 bytes of output: first row in ym9, second row in ym10.
    vbroadcasti32x8  m1,              [r5 + 0 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 1 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 0 * 64],   m9

    vbroadcasti32x8  m1,              [r5 + 2 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 3 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 1 * 64],   m9

    vbroadcasti32x8  m1,              [r5 + 4 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 5 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 2 * 64],   m9

    vbroadcasti32x8  m1,              [r5 + 6 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 7 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 3 * 64],   m9

    vbroadcasti32x8  m1,              [r5 + 8 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 9 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 4 * 64],   m9

    vbroadcasti32x8  m1,              [r5 + 10 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 11 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 5 * 64],   m9

    vbroadcasti32x8  m1,              [r5 + 12 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 13 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 6 * 64],   m9

    vbroadcasti32x8  m1,              [r5 + 14 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 9
    vbroadcasti32x8  m1,              [r5 + 15 * 32]
    DCT16_PASS2_AVX512 1, 14, 15, 16, 17, 20, 21, 22, 23, 10
    vinserti64x4     m9,              m9, ym10, 1
    movu             [r1 + 7 * 64],   m9
    RET

; DCT32_PASS_1  coefOffset, dstOffset, regA, regB
;   %1      byte offset from r7 of the coefficient data; 128-bit vectors are
;           read at %1, %1 + 32 and %1 + 48
;   %2      byte offset from r5 of the destination; 8 bytes are written at
;           %2 and another 8 bytes at %2 + 64
;   %3, %4  register numbers multiplied with the vector at %1
;   m5 and m4 are multiplied with the vector at %1 + 32, m6 and m7 with the
;   vector at %1 + 48.  Reads m9 (rounding constant).
;   Clobbers m8 and m10-m14.
%macro DCT32_PASS_1 4
    vbroadcasti128  m8,                [r7 + %1]
    pmaddwd         m11,               m%3, m8
    pmaddwd         m12,               m%4, m8
    phaddd          m11,               m12

    vbroadcasti128  m8,                [r7 + %1 + 32]
    vbroadcasti128  m10,               [r7 + %1 + 48]
    pmaddwd         m12,               m5, m8
    pmaddwd         m13,               m6, m10
    phaddd          m12,               m13

    pmaddwd         m13,               m4, m8
    pmaddwd         m14,               m7, m10
    phaddd          m13,               m14

    phaddd          m12,               m13

    phaddd          m11,               m12
    paddd           m11,               m9
    psrad           m11,               DCT_SHIFT

    vpermq          m11,               m11, 0xD8           ; swap the two middle qwords before packing
    packssdw        m11,               m11
    movq            [r5 + %2],         xm11                ; four words from the low lane
    vextracti128    xm10,              m11, 1
    movq            [r5 + %2 + 64],    xm10                ; four words from the high lane
%endmacro

; DCT32_PASS_2  coefOffset
;   %1  byte offset applied to both r7 and r8; a full 32-byte coefficient
;       vector is read from each with an aligned load
;   Reads m0-m7 (data; even registers are multiplied with the r7 vector, odd
;   registers with the r8 vector) and m9 (rounding constant).
;   On exit the low 64 bits of xm11 hold four 16-bit results (duplicated in
;   the high 64 bits).  Clobbers m8 and m10-m14.
%macro DCT32_PASS_2 1
    mova            m8,                [r7 + %1]
    mova            m10,               [r8 + %1]
    pmaddwd         m11,               m0, m8
    pmaddwd         m12,               m1, m10
    paddd           m11,               m12

    pmaddwd         m12,               m2, m8
    pmaddwd         m13,               m3, m10
    paddd           m12,               m13

    phaddd          m11,               m12

    pmaddwd         m12,               m4, m8
    pmaddwd         m13,               m5, m10
    paddd           m12,               m13

    pmaddwd         m13,               m6, m8
    pmaddwd         m14,               m7, m10
    paddd           m13,               m14

    phaddd          m12,               m13

    phaddd          m11,               m12
    vextracti128    xm10,              m11, 1
    paddd           xm11,              xm10                ; add the high lane's partial sums to the low lane

    paddd           xm11,               xm9
    psrad           xm11,               DCT_SHIFT2
    packssdw        xm11,               xm11

%endmacro

;-----------------------------------------------------------------------------
; dct32, AVX2 version: two passes over a 32x32 block of 16-bit values.
;   r0 = source pointer (read with unaligned 32-byte loads)
;   r1 = destination pointer (written 8 bytes at a time, 64-byte row pitch)
;   r2 = source row stride; doubled on entry, presumably converting a stride
;        given in 16-bit elements to bytes -- TODO confirm against the caller
; Stack: 64*mmsize bytes used as a 32-row x 64-byte intermediate buffer.
; Pass 1 consumes four source rows per iteration and stores 8 bytes into every
; row of the intermediate buffer; pass 2 reads four 64-byte buffer rows per
; iteration and stores 8 bytes into each of the 32 destination rows.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal dct32, 3, 9, 16, 0-64*mmsize
; m9 = rounding term that DCT32_PASS_1 adds before shifting right by DCT_SHIFT
%if BIT_DEPTH == 12
    %define         DCT_SHIFT          8
    vpbroadcastq    m9,                [pd_128]
%elif BIT_DEPTH == 10
    %define         DCT_SHIFT          6
    vpbroadcastq    m9,                [pd_32]
%elif BIT_DEPTH == 8
    %define         DCT_SHIFT          4
    vpbroadcastq    m9,                [pd_8]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             DCT_SHIFT2         11

    add             r2d,               r2d                  ; r2 = stride * 2

    lea             r7,                [tab_dct32_1]
    lea             r8,                [tab_dct32_2]
    lea             r3,                [r2 * 3]             ; r3 = offset of the 4th row of a group
    mov             r5,                rsp                  ; r5 = write cursor in the stack buffer
    mov             r4d,               8                    ; 8 iterations x 4 rows = 32 rows
    mova            m15,               [dct16_shuf1]        ; byte-shuffle control (table defined outside this view)

.pass1:
    ; Row 0 of the group: shuffle the upper 32 bytes with m15 and swap its
    ; 128-bit halves (vpermq 0x4E), then form difference (m7) and sum (m2)
    ; against the lower 32 bytes.
    movu            m2,                [r0]
    movu            m1,                [r0 + 32]
    pshufb          m1,                m15
    vpermq          m1,                m1, 0x4E
    psubw           m7,                m2, m1
    paddw           m2,                m1

    ; Row 2 of the group: same treatment, difference in m8, sum in m1.
    movu            m1,                [r0 + r2 * 2]
    movu            m0,                [r0 + r2 * 2 + 32]
    pshufb          m0,                m15
    vpermq          m0,                m0, 0x4E
    psubw           m8,                m1, m0
    paddw           m1,                m0
    vperm2i128      m0,                m2, m1, 0x20        ; [row0lo  row2lo] for E
    vperm2i128      m3,                m2, m1, 0x31        ; [row0hi  row2hi] for E
    pshufb          m3,                m15
    psubw           m1,                m0, m3              ; second-level difference of the E terms
    paddw           m0,                m3                  ; second-level sum of the E terms

    vperm2i128      m5,                m7, m8, 0x20        ; [row0lo  row2lo] for O
    vperm2i128      m6,                m7, m8, 0x31        ; [row0hi  row2hi] for O


    ; Row 1 of the group: difference in m10, sum in m4.
    movu            m4,                [r0 + r2]
    movu            m2,                [r0 + r2 + 32]
    pshufb          m2,                m15
    vpermq          m2,                m2, 0x4E
    psubw           m10,               m4, m2
    paddw           m4,                m2

    ; Row 3 of the group: difference in m11, sum in m3.
    movu            m3,                [r0 + r3]
    movu            m2,                [r0 + r3 + 32]
    pshufb          m2,                m15
    vpermq          m2,                m2, 0x4E
    psubw           m11,               m3, m2
    paddw           m3,                m2
    vperm2i128      m2,                m4, m3, 0x20        ; [row1lo  row3lo] for E
    vperm2i128      m8,                m4, m3, 0x31        ; [row1hi  row3hi] for E
    pshufb          m8,                m15
    psubw           m3,                m2, m8              ; second-level difference of the E terms
    paddw           m2,                m8                  ; second-level sum of the E terms

    vperm2i128      m4,                m10, m11, 0x20      ; [row1lo  row3lo] for O
    vperm2i128      m7,                m10, m11, 0x31      ; [row1hi  row3hi] for O


    ; Each invocation reads coefficients at r7 + %1 and stores 8 bytes at
    ; r5 + %2 and r5 + %2 + 64, so the 16 invocations cover all 32 buffer rows.
    DCT32_PASS_1    0 * 32,            0 * 64, 0, 2
    DCT32_PASS_1    2 * 32,            2 * 64, 1, 3
    DCT32_PASS_1    4 * 32,            4 * 64, 0, 2
    DCT32_PASS_1    6 * 32,            6 * 64, 1, 3
    DCT32_PASS_1    8 * 32,            8 * 64, 0, 2
    DCT32_PASS_1    10 * 32,           10 * 64, 1, 3
    DCT32_PASS_1    12 * 32,           12 * 64, 0, 2
    DCT32_PASS_1    14 * 32,           14 * 64, 1, 3
    DCT32_PASS_1    16 * 32,           16 * 64, 0, 2
    DCT32_PASS_1    18 * 32,           18 * 64, 1, 3
    DCT32_PASS_1    20 * 32,           20 * 64, 0, 2
    DCT32_PASS_1    22 * 32,           22 * 64, 1, 3
    DCT32_PASS_1    24 * 32,           24 * 64, 0, 2
    DCT32_PASS_1    26 * 32,           26 * 64, 1, 3
    DCT32_PASS_1    28 * 32,           28 * 64, 0, 2
    DCT32_PASS_1    30 * 32,           30 * 64, 1, 3

    add             r5,                8                    ; next 8-byte column of the buffer
    lea             r0,                [r0 + r2 * 4]        ; next group of four source rows

    dec             r4d
    jnz             .pass1

    ; Pass 2 setup: r2/r3 now describe the destination (64-byte row pitch),
    ; r5 rewinds to the start of the buffer, m9 becomes the pass-2 rounding term.
    mov             r2d,               64
    lea             r3,                [r2 * 3]
    mov             r5,                rsp
    mov             r4d,               8
    vpbroadcastq    m9,                [pd_1024]

.pass2:
    ; Load four 64-byte buffer rows as register pairs m0/m1 .. m6/m7.
    mova            m0,                [r5 + 0 * 64]
    mova            m1,                [r5 + 0 * 64 + 32]

    mova            m2,                [r5 + 1 * 64]
    mova            m3,                [r5 + 1 * 64 + 32]

    mova            m4,                [r5 + 2 * 64]
    mova            m5,                [r5 + 2 * 64 + 32]

    mova            m6,                [r5 + 3 * 64]
    mova            m7,                [r5 + 3 * 64 + 32]

    ; One DCT32_PASS_2 per destination row k (table offset k * 32); the four
    ; packed results in xm11 go to destination row k. r6 steps down the
    ; destination four rows at a time.
    DCT32_PASS_2    0 * 32
    movq            [r1],              xm11
    DCT32_PASS_2    1 * 32
    movq            [r1 + r2],         xm11
    DCT32_PASS_2    2 * 32
    movq            [r1 + r2 * 2],     xm11
    DCT32_PASS_2    3 * 32
    movq            [r1 + r3],         xm11

    lea             r6,                [r1 + r2 * 4]
    DCT32_PASS_2    4 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    5 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    6 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    7 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    8 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    9 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    10 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    11 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    12 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    13 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    14 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    15 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    16 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    17 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    18 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    19 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    20 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    21 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    22 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    23 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    24 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    25 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    26 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    27 * 32
    movq            [r6 + r3],         xm11

    lea             r6,                [r6 + r2 * 4]
    DCT32_PASS_2    28 * 32
    movq            [r6],              xm11
    DCT32_PASS_2    29 * 32
    movq            [r6 + r2],         xm11
    DCT32_PASS_2    30 * 32
    movq            [r6 + r2 * 2],     xm11
    DCT32_PASS_2    31 * 32
    movq            [r6 + r3],         xm11

    add             r5,                256                  ; next four buffer rows
    add             r1,                8                    ; next 8-byte destination column

    dec             r4d
    jnz             .pass2
    RET


;-----------------------------------------------------------------------------
; DCT32_avx512_LOOP o_a, o_b, e_a, e_b
;   Loads four source rows (r0, r0 + r2, r0 + 2*r2, r0 + r3) and performs the
;   first butterfly on two row pairs.
;   %1, %2 = register numbers receiving the differences (O) of pair 0/1, 2/3
;   %3, %4 = 64-byte slot indices in the buffer at r9 receiving the sums (E)
; In:    m31 = vpermw control applied to the upper row halves
; Clobb: m1..m7, m%1, m%2
;-----------------------------------------------------------------------------
%macro DCT32_avx512_LOOP 4
    ; ---- rows 0 and 1 ----
    movu            m1,               [r0]
    movu            m2,               [r0 + r2]

    vextracti64x4   ym4,              m1, 1
    vinserti64x4    m3,               m1, ym2, 1    ; row 0l, 1l
    vinserti64x4    m2,               m2, ym4, 0    ; row 0h, 1h
    vpermw          m2,               m31, m2

    psubw           m%1,              m3, m2        ; O
    paddw           m3,               m3, m2        ; E
    mova            [r9 + %3 * 64],   m3

    ; ---- rows 2 and 3 ----
    movu            m1,               [r0 + 2 * r2]
    movu            m5,               [r0 + r3]

    vextracti64x4   ym7,              m1, 1
    vinserti64x4    m6,               m1, ym5, 1    ; row 2l, 3l
    vinserti64x4    m5,               m5, ym7, 0    ; row 2h, 3h
    vpermw          m5,               m31, m5

    psubw           m%2,              m6, m5        ; O
    paddw           m6,               m6, m5        ; E
    mova            [r9 + %4 * 64],   m6
%endmacro

;-----------------------------------------------------------------------------
; DCT32_avx512_PASS_1_O out_offset, o_a, o_b
;   %1     = byte offset from r5 for the 8-byte result
;   %2, %3 = register numbers holding the O terms of two row pairs
; In:    m9  = coefficients, m8 = vpermi2d control, m30 = vpermw control,
;        m0  = rounding term added before the DCT_SHIFT right shift
; Clobb: m10, m11, m12
;-----------------------------------------------------------------------------
%macro DCT32_avx512_PASS_1_O 3
    ; products of the first O register, upper 64 bits of each lane folded down
    pmaddwd          m10,              m%2, m9
    vpsrldq          m11,              m10, 8
    vpaddd           m10,              m10, m11

    ; same for the second O register
    pmaddwd          m11,              m%3, m9
    vpsrldq          m12,              m11, 8
    vpaddd           m11,              m11, m12

    ; gather the partial sums of both registers and finish the reduction
    mova             m12,              m8
    vpermi2d         m12,              m10, m11
    vpsrldq          m10,              m12, 8
    vpaddd           m12,              m12, m10
    vpsrldq          m10,              m12, 4
    vpaddd           m12,              m12, m10

    ; round, scale, saturate to words, reorder and store four results
    vpaddd           m12,              m12, m0
    vpsrad           m12,              m12, DCT_SHIFT
    vpackssdw        m12,              m12, m12
    vpermw           m12,              m30, m12
    movq             [r5 + %1],        xm12
%endmacro

;-----------------------------------------------------------------------------
; DCT32_avx512_PASS_1_ROW_O
;   Walks all 32 source rows in eight groups of four. For group g it
;     * runs DCT32_avx512_LOOP, leaving the O terms in m(13+2g) / m(14+2g)
;       and the E terms in slots 2g / 2g+1 of the buffer at r9, and
;     * emits the group's 8 bytes of output row 1 at r5 + 1*64 + g*8.
;   r0 advances by four rows between groups (seven times in total).
;   m13..m28 are left holding the O terms for the remaining odd rows.
;-----------------------------------------------------------------------------
%macro DCT32_avx512_PASS_1_ROW_O 0
    vbroadcasti32x8  m9,               [r7 + 1 * 32]

%assign %%grp 0
%rep 8
  %if %%grp > 0
    lea             r0,                [r0 + 4 * r2]
  %endif
  %assign %%o_a  13 + 2 * %%grp
  %assign %%o_b  14 + 2 * %%grp
  %assign %%e_a  2 * %%grp
  %assign %%e_b  2 * %%grp + 1
    DCT32_avx512_LOOP %%o_a, %%o_b, %%e_a, %%e_b
    DCT32_avx512_PASS_1_O              1 * 64 + %%grp * 8, %%o_a, %%o_b
  %assign %%grp %%grp + 1
%endrep
%endmacro

;-----------------------------------------------------------------------------
; DCT32_avx512_PASS_1_ROW_O_1_7 row
;   %1 = output row index; selects the coefficients at r7 + %1 * 32 and the
;        destination at r5 + %1 * 64
;   Reuses the O terms in m13..m28 (register pair 13+2g / 14+2g for group g)
;   and writes 8 bytes per group, 64 bytes in total.
;-----------------------------------------------------------------------------
%macro DCT32_avx512_PASS_1_ROW_O_1_7 1
    vbroadcasti32x8  m9,               [r7 + %1 * 32]

%assign %%grp 0
%rep 8
  %assign %%o_a  13 + 2 * %%grp
  %assign %%o_b  14 + 2 * %%grp
    DCT32_avx512_PASS_1_O              %1 * 64 + %%grp * 8, %%o_a, %%o_b
  %assign %%grp %%grp + 1
%endrep
%endmacro

;-----------------------------------------------------------------------------
; DCT32_avx512_LOOP_EO eo, ee, slot_a, slot_b
;   Reloads two 64-byte E slots saved at rsp + 32 * mmsize and splits them.
;   %1 = register number receiving the difference (EO)
;   %2 = register number receiving the sum (EE)
;   %3, %4 = slot indices of the two E vectors
; In:    m8 = vpermw control
; Clobb: m4, m5, m6, m7, m%1, m%2
;-----------------------------------------------------------------------------
%macro DCT32_avx512_LOOP_EO 4
    mova            m4,                [rsp + 32 * mmsize + %3 * 64]
    mova            m6,                [rsp + 32 * mmsize + %4 * 64]
    vpermw          m4,                m8, m4
    vpermw          m6,                m8, m6

    ; m4 <- low halves of both slots, m5 <- high halves of both slots
    vextracti64x4   ym5,               m4, 1
    vextracti64x4   ym7,               m6, 1
    vinserti64x4    m4,                m4, ym6, 1
    vinserti64x4    m5,                m5, ym7, 1

    psubw           m%1,               m4, m5      ; EO
    paddw           m%2,               m4, m5      ; EE
%endmacro

;-----------------------------------------------------------------------------
; DCT32_avx512_PASS_1_ROW_EO out_offset, eo
;   %1 = byte offset from r5 for the 8-byte result
;   %2 = register number holding the EO terms
; In:    m12 = coefficients, m11 = vpermw control,
;        m0  = rounding term added before the DCT_SHIFT right shift
; Clobb: m29, m30
;-----------------------------------------------------------------------------
%macro DCT32_avx512_PASS_1_ROW_EO 2
    ; multiply, then sum the four dwords of every 128-bit lane
    pmaddwd          m29,              m%2, m12
    vpsrldq          m30,              m29, 8
    vpaddd           m30,              m30, m29
    vpsrldq          m29,              m30, 4
    vpaddd           m29,              m29, m30

    ; round, scale, saturate to words, reorder and store four results
    vpaddd           m29,              m29, m0
    vpsrad           m29,              m29, DCT_SHIFT
    vpackssdw        m29,              m29, m29

    vpermw           m29,              m11, m29
    movq             [r5 + %1],        xm29
%endmacro

;-----------------------------------------------------------------------------
; DCT32_avx512_PASS_1_ROW_EO_0
;   For each of the eight row groups g: split the saved E slots 2g / 2g+1 into
;   EO (m(13+2g)) and EE (m(14+2g)), then emit the group's 8 bytes of output
;   row 2 at r5 + 2*64 + g*8.
;   Loads m8 (vpermw control) and m12 (coefficients of row 2).
; NOTE(review): r9 is advanced between groups, but DCT32_avx512_LOOP_EO
;   addresses the E slots through rsp and never reads r9 -- the lea looks
;   redundant; confirm before removing.
;-----------------------------------------------------------------------------
%macro DCT32_avx512_PASS_1_ROW_EO_0 0
    mova            m8,               [dct32_shuf2_AVX512]
    vbroadcasti32x4 m12,              [r7 + 2 * 32]

%assign %%grp 0
%rep 8
  %if %%grp > 0
    lea             r9,           [r9 + 4 * r2]
  %endif
  %assign %%eo   13 + 2 * %%grp
  %assign %%ee   14 + 2 * %%grp
  %assign %%s_a  2 * %%grp
  %assign %%s_b  2 * %%grp + 1
    DCT32_avx512_LOOP_EO %%eo, %%ee, %%s_a, %%s_b
    DCT32_avx512_PASS_1_ROW_EO    2 * 64 + %%grp * 8, %%eo
  %assign %%grp %%grp + 1
%endrep
%endmacro

;-----------------------------------------------------------------------------
; DCT32_avx512_PASS_1_ROW_EO_1_7 row
;   %1 = output row index; selects the coefficients at r7 + %1 * 32 and the
;        destination at r5 + %1 * 64
;   Reuses the EO terms in m13, m15, ..., m27 and writes 8 bytes per group.
;-----------------------------------------------------------------------------
%macro DCT32_avx512_PASS_1_ROW_EO_1_7 1
    vbroadcasti32x4 m12,         [r7 + %1 * 32]

%assign %%grp 0
%rep 8
  %assign %%eo  13 + 2 * %%grp
    DCT32_avx512_PASS_1_ROW_EO   %1 * 64 + %%grp * 8, %%eo
  %assign %%grp %%grp + 1
%endrep
%endmacro

;-----------------------------------------------------------------------------
; DCT32_avx512_EE_BUTTERFLY diff, a, b     (helper for DCT32_avx512_LOOP_EEO)
;   Interleaves the low / high quadwords of m%2 and m%3, shuffles the high
;   part with m31, then leaves
;     m%3 = low + shuffled high   (EEE)
;     m%1 = low - shuffled high   (EEO)
;   m%2 is clobbered (it ends up holding the shuffled high part).
;-----------------------------------------------------------------------------
%macro DCT32_avx512_EE_BUTTERFLY 3
    vpunpcklqdq        m%1, m%2, m%3
    vpunpckhqdq        m%2, m%2, m%3
    vpshufb            m%2, m%2, m31

    vpaddw             m%3, m%1, m%2    ; EEE
    vpsubw             m%1, m%1, m%2    ; EEO
%endmacro

;-----------------------------------------------------------------------------
; DCT32_avx512_LOOP_EEO
;   Combines the eight EE registers pairwise:
;     (m14, m16) -> EEO in m2, EEE in m16
;     (m18, m20) -> EEO in m3, EEE in m20
;     (m22, m24) -> EEO in m4, EEE in m24
;     (m26, m28) -> EEO in m5, EEE in m28
; In:    m31 = vpshufb control
; Clobb: m14, m18, m22, m26
;-----------------------------------------------------------------------------
%macro DCT32_avx512_LOOP_EEO 0
    DCT32_avx512_EE_BUTTERFLY 2, 14, 16
    DCT32_avx512_EE_BUTTERFLY 3, 18, 20
    DCT32_avx512_EE_BUTTERFLY 4, 22, 24
    DCT32_avx512_EE_BUTTERFLY 5, 26, 28
%endmacro

;-----------------------------------------------------------------------------
; DCT32_avx512_PASS_1_ROW_EEO out_offset, src
;   %1 = byte offset from r5 for the 16-byte result
;   %2 = register number holding the input terms
; In:    m1  = coefficients, m27 = vpermw control,
;        m0  = rounding term added before the DCT_SHIFT right shift
; Clobb: m29, m30
;-----------------------------------------------------------------------------
%macro DCT32_avx512_PASS_1_ROW_EEO 2
    ; multiply, then add each dword to its upper neighbour within the lane
    pmaddwd          m30,              m%2, m1
    vpsrldq          m29,              m30, 4
    vpaddd           m29,              m29, m30

    ; round, scale, saturate to words, reorder and store
    vpaddd           m29,              m29, m0
    vpsrad           m29,              m29, DCT_SHIFT
    vpackssdw        m29,              m29, m29

    vpermw           m29,              m27, m29
    movu             [r5 + %1],        xm29
%endmacro

;-----------------------------------------------------------------------------
; DCT32_avx512_PASS_1_ROW_EEO_1_4 row
;   %1 = output row index; selects the coefficients at r7 + %1 * 32 and the
;        destination at r5 + %1 * 64
;   Consumes the EEO terms in m2, m3, m4, m5 and writes 4 x 16 bytes, i.e. the
;   complete 64-byte output row.
;-----------------------------------------------------------------------------
%macro DCT32_avx512_PASS_1_ROW_EEO_1_4 1
    vpbroadcastq     m1,            [r7 + %1 * 32]

%assign %%blk 0
%rep 4
  %assign %%src  2 + %%blk
    DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + %%blk * 16, %%src
  %assign %%blk %%blk + 1
%endrep
%endmacro

;-----------------------------------------------------------------------------
; DCT32_avx512_PASS_1_ROW_EEEO_1_4 row
;   %1 = output row index; selects the coefficients at r7 + %1 * 32 and the
;        destination at r5 + %1 * 64
;   Consumes the EEE terms in m16, m20, m24, m28 and writes 4 x 16 bytes, i.e.
;   the complete 64-byte output row.
;-----------------------------------------------------------------------------
%macro DCT32_avx512_PASS_1_ROW_EEEO_1_4 1
    vpbroadcastq     m1,            [r7 + %1 * 32]

%assign %%blk 0
%rep 4
  %assign %%src  16 + 4 * %%blk
    DCT32_avx512_PASS_1_ROW_EEO     %1 * 64 + %%blk * 16, %%src
  %assign %%blk %%blk + 1
%endrep
%endmacro

;-----------------------------------------------------------------------------
; DCT32_avx512_PASS2_OPT src_a, src_b, src_c, src_d, out_offset
;   %1..%4 = register numbers holding four pass-1 rows
;   %5     = byte offset from r1 for the 8-byte result
; In:    m1  = coefficient row, m31 = vpermd control,
;        m27 / m30 / m29 / m28 = masks applied to the 1st / 2nd / 3rd / 4th
;        permuted partial sum, m0 = rounding term added before DCT_SHIFT2
; Clobb: m9, m10, m11, m12, m13
;-----------------------------------------------------------------------------
%macro DCT32_avx512_PASS2_OPT 5
    ; multiply each row by the coefficients and fold the upper 64 bits of
    ; every 128-bit lane onto the lower 64 bits
    pmaddwd         m9,                m1,  m%1
    vpsrldq         m10,               m9,  8
    vpaddd          m9,                m9,  m10

    pmaddwd         m10,               m1,  m%2
    vpsrldq         m11,               m10, 8
    vpaddd          m10,               m10, m11

    pmaddwd         m11,               m1,  m%3
    vpsrldq         m12,               m11, 8
    vpaddd          m11,               m11, m12

    pmaddwd         m12,               m1,  m%4
    vpsrldq         m13,               m12, 8
    vpaddd          m12,               m12, m13

    ; finish the per-lane sums
    vpsrldq         m13,               m9,  4
    vpaddd          m9,                m9,  m13
    vpsrldq         m13,               m10, 4
    vpaddd          m10,               m10, m13
    vpsrldq         m13,               m11, 4
    vpaddd          m11,               m11, m13
    vpsrldq         m13,               m12, 4
    vpaddd          m12,               m12, m13

    ; permute each accumulator and keep only the dwords its mask selects
    vpermd          m9,                m31, m9
    vpandd          m9,                m9,  m27
    vpermd          m10,               m31, m10
    vpandd          m10,               m10, m30
    vpermd          m11,               m31, m11
    vpandd          m11,               m11, m29
    vpermd          m12,               m31, m12
    vpandd          m12,               m12, m28

    ; merge the four accumulators and reduce once more
    vpaddd          m9,                m9,  m10
    vpaddd          m11,               m11, m12
    vpaddd          m9,                m9,  m11

    vpsrldq         m10,               m9,  8
    vpaddd          m9,                m9,  m10
    vpsrldq         m10,               m9,  4
    vpaddd          m9,                m9,  m10

    ; reorder, round, scale, saturate to words and store four results
    vpermd          m9,                m31, m9
    vpaddd          m9,                m9,  m0
    vpsrad          m9,                m9,  DCT_SHIFT2
    vpackssdw       m9,                m9,  m9
    movq            [r1 + %5],         xm9
%endmacro

;-----------------------------------------------------------------------------
; DCT32_avx512_PASS2 off_a, off_b, off_c, off_d, out_offset
;   Same computation as DCT32_avx512_PASS2_OPT, but the four pass-1 rows are
;   loaded from the buffer at r5 instead of being passed in registers.
;   %1..%4 = byte offsets from r5 of the four rows
;   %5     = byte offset from r1 for the 8-byte result
; In:    m1  = coefficient row, m31 = vpermd control,
;        m27 / m30 / m29 / m28 = masks applied to the 1st / 2nd / 3rd / 4th
;        permuted partial sum, m0 = rounding term added before DCT_SHIFT2
; Clobb: m9, m10, m11, m12, m13
;-----------------------------------------------------------------------------
%macro DCT32_avx512_PASS2 5
    ; row %1: multiply, sum each 128-bit lane, permute, mask
    mova            m9,                [r5 + %1]
    pmaddwd         m9,                m1,  m9
    vpsrldq         m13,               m9,  8
    vpaddd          m9,                m9,  m13
    vpsrldq         m13,               m9,  4
    vpaddd          m9,                m9,  m13
    vpermd          m9,                m31, m9
    vpandd          m9,                m9,  m27

    ; row %2
    mova            m10,               [r5 + %2]
    pmaddwd         m10,               m1,  m10
    vpsrldq         m13,               m10, 8
    vpaddd          m10,               m10, m13
    vpsrldq         m13,               m10, 4
    vpaddd          m10,               m10, m13
    vpermd          m10,               m31, m10
    vpandd          m10,               m10, m30

    ; row %3
    mova            m11,               [r5 + %3]
    pmaddwd         m11,               m1,  m11
    vpsrldq         m13,               m11, 8
    vpaddd          m11,               m11, m13
    vpsrldq         m13,               m11, 4
    vpaddd          m11,               m11, m13
    vpermd          m11,               m31, m11
    vpandd          m11,               m11, m29

    ; row %4
    mova            m12,               [r5 + %4]
    pmaddwd         m12,               m1,  m12
    vpsrldq         m13,               m12, 8
    vpaddd          m12,               m12, m13
    vpsrldq         m13,               m12, 4
    vpaddd          m12,               m12, m13
    vpermd          m12,               m31, m12
    vpandd          m12,               m12, m28

    ; merge the four accumulators and reduce once more
    vpaddd          m9,                m9,  m10
    vpaddd          m11,               m11, m12
    vpaddd          m9,                m9,  m11

    vpsrldq         m10,               m9,  8
    vpaddd          m9,                m9,  m10
    vpsrldq         m10,               m9,  4
    vpaddd          m9,                m9,  m10

    ; reorder, round, scale, saturate to words and store four results
    vpermd          m9,                m31, m9
    vpaddd          m9,                m9,  m0
    vpsrad          m9,                m9,  DCT_SHIFT2
    vpackssdw       m9,                m9,  m9
    movq            [r1 + %5],         xm9
%endmacro

;-----------------------------------------------------------------------------
; DCT32_avx512_PASS2_1_ROW row
;   %1 = output row index; loads the coefficient row at r8 + %1 * 64 into m1
;        and writes 8 x 8 bytes starting at r1 + %1 * 64.
;   Groups 0..4 take their four pass-1 rows from registers (m2, m3, m4, m14,
;   m15..m26, m5..m8); groups 5..7 reload rows 20..31 from the buffer at r5.
;-----------------------------------------------------------------------------
%macro DCT32_avx512_PASS2_1_ROW 1
    mova            m1,                [r8 + %1 * 64]

    ; register-resident pass-1 rows
    DCT32_avx512_PASS2_OPT  2,  3,  4, 14, %1 * 64 + 0 * 8
    DCT32_avx512_PASS2_OPT 15, 16, 17, 18, %1 * 64 + 1 * 8
    DCT32_avx512_PASS2_OPT 19, 20, 21, 22, %1 * 64 + 2 * 8
    DCT32_avx512_PASS2_OPT 23, 24, 25, 26, %1 * 64 + 3 * 8
    DCT32_avx512_PASS2_OPT  5,  6,  7,  8, %1 * 64 + 4 * 8

    ; pass-1 rows 20..31, four per group, read back from memory
%assign %%grp 0
%rep 3
  %assign %%row  20 + 4 * %%grp
  %assign %%col  5 + %%grp
    DCT32_avx512_PASS2 (%%row + 0) * 64, (%%row + 1) * 64, (%%row + 2) * 64, (%%row + 3) * 64, %1 * 64 + %%col * 8
  %assign %%grp %%grp + 1
%endrep
%endmacro

;-----------------------------------------------------------------------------
; dct32, AVX-512 version: two passes over a 32x32 block of 16-bit values.
;   r0 = source pointer (read with unaligned 64-byte loads)
;   r1 = destination pointer (written 8 bytes at a time, 64-byte row pitch)
;   r2 = source row stride; doubled on entry, presumably converting a stride
;        given in 16-bit elements to bytes -- TODO confirm against the caller
; Stack layout (48 * mmsize bytes):
;   [rsp,               rsp + 32 * mmsize)  pass-1 output, 32 rows x 64 bytes (r5)
;   [rsp + 32 * mmsize, rsp + 48 * mmsize)  E terms saved by DCT32_avx512_LOOP (r9)
; Pass 1 is split by output row:
;   odd rows 1, 3, ..., 31    from the O terms   (m13..m28)
;   rows 2, 6, ..., 30        from the EO terms  (m13, m15, ..., m27)
;   rows 4, 12, 20, 28        from the EEO terms (m2..m5)
;   rows 0, 16, 8, 24         from the EEE terms (m16, m20, m24, m28)
;-----------------------------------------------------------------------------
INIT_ZMM avx512
cglobal dct32, 3, 10, 32, 0-(32*mmsize + 16*mmsize)

; m0 = rounding term that the pass-1 macros add before shifting by DCT_SHIFT
%if BIT_DEPTH == 12
    %define         DCT_SHIFT          8
    vpbroadcastq    m0,                [pd_128]
%elif BIT_DEPTH == 10
    %define         DCT_SHIFT          6
    vpbroadcastq    m0,                [pd_32]
%elif BIT_DEPTH == 8
    %define         DCT_SHIFT          4
    vpbroadcastq    m0,                [pd_8]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             DCT_SHIFT2         11

    add             r2d,               r2d                  ; r2 = stride * 2
    lea             r7,                [tab_dct32_1]        ; pass-1 coefficients
    lea             r8,                [tab_dct32]          ; pass-2 coefficients
    lea             r3,                [r2 * 3]
    mov             r5,                rsp                  ; pass-1 output buffer
    lea             r9,                [rsp + 32 * mmsize]  ; E-term scratch area

    mova            m31,               [dct32_shuf1_AVX512]

    ; PASS 1

    vbroadcasti32x8 m30,               [dct8_shuf9_AVX512]
    mova            m8,                [dct32_shuf_AVX512]

    ; Odd output rows. Only odd indices are listed: every even row is produced
    ; by one of the E-stage macros below, each of which writes its full
    ; 64-byte row, so computing an even row here would only be overwritten.
    DCT32_avx512_PASS_1_ROW_O
    DCT32_avx512_PASS_1_ROW_O_1_7  3
    DCT32_avx512_PASS_1_ROW_O_1_7  5
    DCT32_avx512_PASS_1_ROW_O_1_7  7
    DCT32_avx512_PASS_1_ROW_O_1_7  9
    DCT32_avx512_PASS_1_ROW_O_1_7 11
    DCT32_avx512_PASS_1_ROW_O_1_7 13
    DCT32_avx512_PASS_1_ROW_O_1_7 15
    DCT32_avx512_PASS_1_ROW_O_1_7 17
    DCT32_avx512_PASS_1_ROW_O_1_7 19
    DCT32_avx512_PASS_1_ROW_O_1_7 21
    DCT32_avx512_PASS_1_ROW_O_1_7 23
    DCT32_avx512_PASS_1_ROW_O_1_7 25
    DCT32_avx512_PASS_1_ROW_O_1_7 27
    DCT32_avx512_PASS_1_ROW_O_1_7 29
    DCT32_avx512_PASS_1_ROW_O_1_7 31

    ; m30 is used as scratch by the EO macro, so the shuffle moves to m11
    vbroadcasti32x8  m11,               [dct8_shuf9_AVX512]

    ; Rows 2, 6, ..., 30
    DCT32_avx512_PASS_1_ROW_EO_0
    DCT32_avx512_PASS_1_ROW_EO_1_7 6
    DCT32_avx512_PASS_1_ROW_EO_1_7 10
    DCT32_avx512_PASS_1_ROW_EO_1_7 14
    DCT32_avx512_PASS_1_ROW_EO_1_7 18
    DCT32_avx512_PASS_1_ROW_EO_1_7 22
    DCT32_avx512_PASS_1_ROW_EO_1_7 26
    DCT32_avx512_PASS_1_ROW_EO_1_7 30

    vbroadcasti32x4  m31,               [dct8_shuf]
    vbroadcasti32x8  m27,               [dct32_shuf3_AVX512]

    ; Rows 4, 12, 20, 28
    DCT32_avx512_LOOP_EEO
    DCT32_avx512_PASS_1_ROW_EEO_1_4 4
    DCT32_avx512_PASS_1_ROW_EEO_1_4 12
    DCT32_avx512_PASS_1_ROW_EEO_1_4 20
    DCT32_avx512_PASS_1_ROW_EEO_1_4 28

    ; Rows 0, 16, 8, 24
    DCT32_avx512_PASS_1_ROW_EEEO_1_4 0
    DCT32_avx512_PASS_1_ROW_EEEO_1_4 16
    DCT32_avx512_PASS_1_ROW_EEEO_1_4 8
    DCT32_avx512_PASS_1_ROW_EEEO_1_4 24

    ; PASS 2

    vpbroadcastq    m0,               [pd_1024]             ; pass-2 rounding term
    vbroadcasti32x8 m31,              [dct32_shuf4_AVX512]
    movu            m30,              [dct32_shuf5_AVX512]
    movu            m29,              [dct32_shuf6_AVX512]
    movu            m28,              [dct32_shuf7_AVX512]
    movu            m27,              [dct32_shuf8_AVX512]

    ; Load the first 20 pass-1 rows into free registers and reuse them for all
    ; output rows; rows 20..31 are reloaded from the buffer by DCT32_avx512_PASS2.

    mova            m2,               [r5 +  0 * 64]
    mova            m3,               [r5 +  1 * 64]
    mova            m4,               [r5 +  2 * 64]
    mova            m14,              [r5 +  3 * 64]
    mova            m15,              [r5 +  4 * 64]
    mova            m16,              [r5 +  5 * 64]
    mova            m17,              [r5 +  6 * 64]
    mova            m18,              [r5 +  7 * 64]
    mova            m19,              [r5 +  8 * 64]
    mova            m20,              [r5 +  9 * 64]
    mova            m21,              [r5 + 10 * 64]
    mova            m22,              [r5 + 11 * 64]
    mova            m23,              [r5 + 12 * 64]
    mova            m24,              [r5 + 13 * 64]
    mova            m25,              [r5 + 14 * 64]
    mova            m26,              [r5 + 15 * 64]
    mova             m5,              [r5 + 16 * 64]
    mova             m6,              [r5 + 17 * 64]
    mova             m7,              [r5 + 18 * 64]
    mova             m8,              [r5 + 19 * 64]

    DCT32_avx512_PASS2_1_ROW 0
    DCT32_avx512_PASS2_1_ROW 1
    DCT32_avx512_PASS2_1_ROW 2
    DCT32_avx512_PASS2_1_ROW 3
    DCT32_avx512_PASS2_1_ROW 4
    DCT32_avx512_PASS2_1_ROW 5
    DCT32_avx512_PASS2_1_ROW 6
    DCT32_avx512_PASS2_1_ROW 7
    DCT32_avx512_PASS2_1_ROW 8
    DCT32_avx512_PASS2_1_ROW 9
    DCT32_avx512_PASS2_1_ROW 10
    DCT32_avx512_PASS2_1_ROW 11
    DCT32_avx512_PASS2_1_ROW 12
    DCT32_avx512_PASS2_1_ROW 13
    DCT32_avx512_PASS2_1_ROW 14
    DCT32_avx512_PASS2_1_ROW 15
    DCT32_avx512_PASS2_1_ROW 16
    DCT32_avx512_PASS2_1_ROW 17
    DCT32_avx512_PASS2_1_ROW 18
    DCT32_avx512_PASS2_1_ROW 19
    DCT32_avx512_PASS2_1_ROW 20
    DCT32_avx512_PASS2_1_ROW 21
    DCT32_avx512_PASS2_1_ROW 22
    DCT32_avx512_PASS2_1_ROW 23
    DCT32_avx512_PASS2_1_ROW 24
    DCT32_avx512_PASS2_1_ROW 25
    DCT32_avx512_PASS2_1_ROW 26
    DCT32_avx512_PASS2_1_ROW 27
    DCT32_avx512_PASS2_1_ROW 28
    DCT32_avx512_PASS2_1_ROW 29
    DCT32_avx512_PASS2_1_ROW 30
    DCT32_avx512_PASS2_1_ROW 31

    RET

;-----------------------------------------------------------------------------
; IDCT8_PASS_1 table_offset
;   %1 = byte offset into the coefficient tables at r5 and r6
; In:    m4, m0 = inputs multiplied by the r5 coefficients
;        m1, m2 = inputs multiplied by the r6 coefficients
;        m11    = rounding term added before the IDCT_SHIFT1 right shift
; Out:   m3 = packed (r5 part + r6 part) for offsets %1 and %1 + 32
;        m6 = packed (r5 part - r6 part), offset %1 + 32 first, then %1
; Clobb: m5, m7, m8, m9, m10
;-----------------------------------------------------------------------------
%macro IDCT8_PASS_1 1
    ; ---- coefficients at offset %1 ----
    vpbroadcastd    m7,                [r5 + %1]
    vpbroadcastd    m10,               [r5 + %1 + 4]
    pmaddwd         m5,                m4, m7
    pmaddwd         m6,                m0, m10
    paddd           m5,                m5, m6

    vpbroadcastd    m7,                [r6 + %1]
    vpbroadcastd    m10,               [r6 + %1 + 4]
    pmaddwd         m6,                m1, m7
    pmaddwd         m3,                m2, m10
    paddd           m6,                m6, m3

    paddd           m3,                m5, m6               ; sum
    paddd           m3,                m3, m11
    psrad           m3,                m3, IDCT_SHIFT1

    psubd           m5,                m5, m6               ; difference
    paddd           m5,                m5, m11
    psrad           m5,                m5, IDCT_SHIFT1

    ; ---- coefficients at offset %1 + 32 ----
    vpbroadcastd    m7,                [r5 + %1 + 32]
    vpbroadcastd    m10,               [r5 + %1 + 36]
    pmaddwd         m6,                m4, m7
    pmaddwd         m8,                m0, m10
    paddd           m6,                m6, m8

    vpbroadcastd    m7,                [r6 + %1 + 32]
    vpbroadcastd    m10,               [r6 + %1 + 36]
    pmaddwd         m8,                m1, m7
    pmaddwd         m9,                m2, m10
    paddd           m8,                m8, m9

    paddd           m9,                m6, m8               ; sum
    paddd           m9,                m9, m11
    psrad           m9,                m9, IDCT_SHIFT1

    psubd           m6,                m6, m8               ; difference
    paddd           m6,                m6, m11
    psrad           m6,                m6, IDCT_SHIFT1

    ; saturate to words; vpermq 0xD8 swaps the two middle quadwords so each
    ; 128-bit half holds one complete packed source
    packssdw        m3,                m3, m9
    vpermq          m3,                m3, 0xD8

    packssdw        m6,                m6, m5
    vpermq          m6,                m6, 0xD8
%endmacro

;-----------------------------------------------------------------------------
; IDCT8_PASS_2
; In:    m0, m1 = two registers of pass-1 output
;        r5, r6 = coefficient tables (four 32-byte rows are read from each)
;        m12    = rounding term added before the IDCT_SHIFT2 right shift
; Out:   m8, m9 = packed 16-bit results
; Clobb: m0, m2, m3, m5, m6, m7
;-----------------------------------------------------------------------------
%macro IDCT8_PASS_2 0
    punpcklqdq      m2,                m0, m1
    punpckhqdq      m0,                m0, m1

    ; low-quadword part (m2) against the r5 table -> m7, m3
    pmaddwd         m3,                m2, [r5]
    pmaddwd         m5,                m2, [r5 + 32]
    pmaddwd         m6,                m2, [r5 + 64]
    pmaddwd         m7,                m2, [r5 + 96]
    phaddd          m3,                m3, m5
    phaddd          m6,                m6, m7
    pshufb          m3,                m3, [idct8_shuf2]
    pshufb          m6,                m6, [idct8_shuf2]
    punpcklqdq      m7,                m3, m6
    punpckhqdq      m3,                m3, m6

    ; high-quadword part (m0) against the r6 table -> m6, m5
    pmaddwd         m5,                m0, [r6]
    pmaddwd         m6,                m0, [r6 + 32]
    pmaddwd         m8,                m0, [r6 + 64]
    pmaddwd         m9,                m0, [r6 + 96]
    phaddd          m5,                m5, m6
    phaddd          m8,                m8, m9
    pshufb          m5,                m5, [idct8_shuf2]
    pshufb          m8,                m8, [idct8_shuf2]
    punpcklqdq      m6,                m5, m8
    punpckhqdq      m5,                m5, m8

    ; first output register: sum and (shuffled) difference of m7 and m6
    paddd           m8,                m7, m6
    paddd           m8,                m8, m12
    psrad           m8,                m8, IDCT_SHIFT2

    psubd           m7,                m7, m6
    paddd           m7,                m7, m12
    psrad           m7,                m7, IDCT_SHIFT2

    pshufb          m7,                m7, [idct8_shuf3]
    packssdw        m8,                m8, m7

    ; second output register: sum and (shuffled) difference of m3 and m5
    paddd           m9,                m3, m5
    paddd           m9,                m9, m12
    psrad           m9,                m9, IDCT_SHIFT2

    psubd           m3,                m3, m5
    paddd           m3,                m3, m12
    psrad           m3,                m3, IDCT_SHIFT2

    pshufb          m3,                m3, [idct8_shuf3]
    packssdw        m9,                m9, m3
%endmacro

;-----------------------------------------------------------------------------
; idct8, AVX2 version: two passes over an 8x8 block of 16-bit coefficients.
;   r0 = coefficient pointer (four aligned 32-byte loads, 128 bytes in total)
;   r1 = destination pointer (aligned 16-byte stores, one per output row)
;   r2 = destination row stride; doubled before pass 2, presumably converting
;        a stride given in 16-bit elements to bytes -- TODO confirm
; Stack: 8*16 bytes holding the four 32-byte pass-1 result vectors.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal idct8, 3, 7, 13, 0-8*16
; m12 = rounding term that IDCT8_PASS_2 adds before shifting by IDCT_SHIFT2
%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m12,                [pd_128]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m12,                [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m12,                [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             IDCT_SHIFT1         7

    ; m11 = rounding term that IDCT8_PASS_1 adds before shifting by IDCT_SHIFT1
    vbroadcasti128  m11,               [pd_64]

    mov             r4,                rsp                  ; r4 = pass-1 result buffer
    lea             r5,                [avx2_idct8_1]
    lea             r6,                [avx2_idct8_2]

    ;pass1
    ; Interleave the coefficient rows pairwise so every pmaddwd in
    ; IDCT8_PASS_1 multiplies two rows at once:
    ;   m4 = rows 0/2, m1 = rows 1/3, m0 = rows 4/6, m2 = rows 5/7
    mova            m1,                [r0 + 0 * 32]     ; [0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1]
    mova            m0,                [r0 + 1 * 32]     ; [2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3]
    vpunpcklwd      m5,      m1,       m0                ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
    vpunpckhwd      m1,      m0                          ; [0 2 0 2 0 2 0 2 1 3 1 3 1 3 1 3]
    vinserti128     m4,      m5,       xm1,       1      ; [0 2 0 2 0 2 0 2 0 2 0 2 0 2 0 2]
    vextracti128    xm2,     m5,       1                 ; [1 3 1 3 1 3 1 3]
    vinserti128     m1,      m1,       xm2,       0      ; [1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3]

    mova            m2,                [r0 + 2 * 32]     ; [4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5]
    mova            m0,                [r0 + 3 * 32]     ; [6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7]
    vpunpcklwd      m5,      m2,       m0                ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
    vpunpckhwd      m2,      m0                          ; [4 6 4 6 4 6 4 6 5 7 5 7 5 7 5 7]
    vinserti128     m0,      m5,       xm2,       1     ; [4 6 4 6 4 6 4 6 4 6 4 6 4 6 4 6]
    vextracti128    xm5,     m5,       1                ; [5 7 5 7 5 7 5 7]
    vinserti128     m2,      m2,       xm5,       0     ; [5 7 5 7 5 7 5 7 5 7 5 7 5 7 5 7]

    ; Reorder the dwords of all four interleaved vectors with idct8_shuf1
    ; (table defined outside this view).
    mova            m5,                [idct8_shuf1]
    vpermd          m4,                m5, m4
    vpermd          m0,                m5, m0
    vpermd          m1,                m5, m1
    vpermd          m2,                m5, m2

    ; Each IDCT8_PASS_1 yields a "sum" vector (m3) and a "difference" vector
    ; (m6); sums go to buffer slots 0 and 1, differences to slots 3 and 2.
    IDCT8_PASS_1    0
    mova            [r4],              m3
    mova            [r4 + 96],         m6

    IDCT8_PASS_1    64
    mova            [r4 + 32],         m3
    mova            [r4 + 64],         m6

    ;pass2
    add             r2d,               r2d                  ; r2 = stride * 2
    lea             r3,                [r2 * 3]

    ; First half: buffer slots 0 and 1 -> destination rows 0..3
    mova            m0,                [r4]
    mova            m1,                [r4 + 32]
    IDCT8_PASS_2

    vextracti128    xm3,               m8, 1
    mova            [r1],              xm8
    mova            [r1 + r2],         xm3
    vextracti128    xm3,               m9, 1
    mova            [r1 + r2 * 2],     xm9
    mova            [r1 + r3],         xm3

    ; Second half: buffer slots 2 and 3 -> destination rows 4..7
    lea             r1,                [r1 + r2 * 4]
    mova            m0,                [r4 + 64]
    mova            m1,                [r4 + 96]
    IDCT8_PASS_2

    vextracti128    xm3,               m8, 1
    mova            [r1],              xm8
    mova            [r1 + r2],         xm3
    vextracti128    xm3,               m9, 1
    mova            [r1 + r2 * 2],     xm9
    mova            [r1 + r3],         xm3
    RET


;-----------------------------------------------------------------------------
; IDCT8_AVX512_PASS_1
;   Register-only counterpart of IDCT8_PASS_1: the coefficients are expected
;   in m17..m24 instead of being broadcast from memory.
; In:    m29, m25 = inputs multiplied by m17/m18 and m19/m20
;        m30, m26 = inputs multiplied by m21/m22 and m23/m24
;        m11      = rounding term added before the IDCT_SHIFT1 right shift
; Out:   m3 = packed sums, m6 = packed differences
; Clobb: m5, m8, m9
;-----------------------------------------------------------------------------
%macro IDCT8_AVX512_PASS_1 0
    ; ---- first coefficient set (m17, m18, m21, m22) ----
    pmaddwd         m5,                m29, m17
    pmaddwd         m6,                m25, m18
    paddd           m5,                m5, m6

    pmaddwd         m6,                m30, m21
    pmaddwd         m3,                m26, m22
    paddd           m6,                m6, m3

    paddd           m3,                m5, m6               ; sum
    paddd           m3,                m3, m11
    psrad           m3,                m3, IDCT_SHIFT1

    psubd           m5,                m5, m6               ; difference
    paddd           m5,                m5, m11
    psrad           m5,                m5, IDCT_SHIFT1

    ; ---- second coefficient set (m19, m20, m23, m24) ----
    pmaddwd         m6,                m29, m19
    pmaddwd         m8,                m25, m20
    paddd           m6,                m6, m8

    pmaddwd         m8,                m30, m23
    pmaddwd         m9,                m26, m24
    paddd           m8,                m8, m9

    paddd           m9,                m6, m8               ; sum
    paddd           m9,                m9, m11
    psrad           m9,                m9, IDCT_SHIFT1

    psubd           m6,                m6, m8               ; difference
    paddd           m6,                m6, m11
    psrad           m6,                m6, IDCT_SHIFT1

    ; saturate to words and regroup quadwords
    packssdw        m3,                m3, m9
    vpermq          m3,                m3, 0xD8

    packssdw        m6,                m6, m5
    vpermq          m6,                m6, 0xD8
%endmacro


;-----------------------------------------------------------------------------
; IDCT8_AVX512_PASS_2
; Second (shift-by-IDCT_SHIFT2) stage of the AVX-512 8x8 inverse DCT.
;
; In:    m3, m13 = pass-1 results
;        r5      = base of four mmsize-wide coefficient rows applied to the
;                  low qwords of m3/m13
;        r6      = base of four mmsize-wide coefficient rows applied to the
;                  high qwords of m3/m13
;        m12     = rounding constant added before the shift
; Out:   m8, m9  = packed int16 results
; Clobb: m0, m2, m3, m5, m6, m7, m14, m16, r7, k1
;
; Pairs of adjacent dwords are summed without phaddd: one operand is added to
; a copy of itself shifted right by 4 bytes (pair sum lands in the even dword),
; the other to a copy shifted left by 4 bytes (pair sum lands in the odd
; dword), and a masked move with k1 = 0xAAAA (odd dwords) merges the two.
;-----------------------------------------------------------------------------
%macro IDCT8_AVX512_PASS_2 0
    mov             r7d, 0xAAAA
    kmovd           k1, r7d
    punpcklqdq      m2,                m3, m13
    punpckhqdq      m0,                m3, m13

    pmaddwd         m3,                m2, [r5]
    pmaddwd         m5,                m2, [r5 + 1 * mmsize]
    pmaddwd         m6,                m2, [r5 + 2 * mmsize]
    pmaddwd         m7,                m2, [r5 + 3 * mmsize]

    ; m3 = interleaved pair sums of m3 (even dwords) and m5 (odd dwords)
    vpsrldq         m14,   m3, 4
    paddd            m3,  m14
    vpslldq         m16,   m5, 4
    paddd            m5,  m16
    vmovdqu32        m3   {k1}, m5

    ; m6 = interleaved pair sums of m6 (even dwords) and m7 (odd dwords)
    vpsrldq         m14,   m6, 4
    paddd            m6,  m14
    vpslldq         m16,   m7, 4
    paddd            m7,  m16
    vmovdqu32        m6   {k1}, m7

    punpcklqdq      m7,                m3, m6
    punpckhqdq      m3,                m6

    pmaddwd         m5,                m0, [r6]
    pmaddwd         m6,                m0, [r6 + 1 * mmsize]
    pmaddwd         m8,                m0, [r6 + 2 * mmsize]
    pmaddwd         m9,                m0, [r6 + 3 * mmsize]

    ; m5 = interleaved pair sums of m5 (even dwords) and m6 (odd dwords)
    vpsrldq         m14,   m5, 4
    paddd            m5,  m14
    vpslldq         m16,   m6, 4
    paddd            m6,  m16
    vmovdqu32        m5   {k1}, m6

    ; m8 = interleaved pair sums of m8 (even dwords) and m9 (odd dwords)
    vpsrldq         m14,   m8, 4
    paddd            m8,  m14
    vpslldq         m16,   m9, 4
    paddd            m9,  m16
    vmovdqu32        m8   {k1}, m9

    punpcklqdq      m6,                m5, m8
    punpckhqdq      m5,                m8

    ; m8 = (m7 + m6 + round) >> IDCT_SHIFT2
    paddd           m8,                m7, m6
    paddd           m8,                m12
    psrad           m8,                IDCT_SHIFT2

    ; m7 = (m7 - m6 + round) >> IDCT_SHIFT2
    psubd           m7,                m6
    paddd           m7,                m12
    psrad           m7,                IDCT_SHIFT2

    ; reorder the difference half with idct8_avx512_shuf3 before packing
    pshufb          m7,                [idct8_avx512_shuf3]
    packssdw        m8,                 m7

    ; m9 = (m3 + m5 + round) >> IDCT_SHIFT2
    paddd           m9,                m3, m5
    paddd           m9,                m12
    psrad           m9,                IDCT_SHIFT2

    ; m3 = (m3 - m5 + round) >> IDCT_SHIFT2
    psubd           m3,                m5
    paddd           m3,                m12
    psrad           m3,                IDCT_SHIFT2

    pshufb          m3,                [idct8_avx512_shuf3]
    packssdw        m9,                m3
%endmacro


%if ARCH_X86_64
;-----------------------------------------------------------------------------
; idct8 (AVX-512), 8x8 inverse DCT
;
; In:    r0 = input coefficients, read as four aligned 32-byte rows
;        r1 = output, written as eight aligned 16-byte rows
;        r2 = output stride; doubled below before being used as a byte offset
;             (presumably given in int16_t units - confirm against callers)
; Uses:  r3 = 3 * stride, r4 = avx512_idct8_3, r5 = avx512_idct8_1,
;        r6 = avx512_idct8_2, r7/k1 (inside IDCT8_AVX512_PASS_2)
; Const: m11 = pass-1 rounding (pd_64), m12 = pass-2 rounding (depends on
;        BIT_DEPTH), IDCT_SHIFT1 = 7, IDCT_SHIFT2 = 8/10/12 for 12/10/8-bit.
;-----------------------------------------------------------------------------
INIT_ZMM avx512
cglobal idct8, 3, 8, 31
%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m12,                [pd_128]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m12,                [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m12,                [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             IDCT_SHIFT1         7

    vpbroadcastd     m11,               [pd_64]

    lea             r4,                [avx512_idct8_3]
    ; Only the low 256 bits of the two word-permute index vectors are consumed
    ; (ym16/ym17 below), so load just those; m16/m17 are overwritten before
    ; their upper halves are ever read.
    movu            ym16,              [idct16_shuff2]
    movu            ym17,              [idct16_shuff3]

    ;pass1
    ; rows 0..1 of the 32-byte input layout, permuted two ways
    mova            ym1, [r0 + 0 * 32]
    mova            ym0, [r0 + 1 * 32]
    mova            ym25, ym16
    mova            ym26, ym17
    vpermi2w        ym25,  ym1, ym0
    vpermi2w        ym26,  ym1, ym0

    ; rows 2..3 of the 32-byte input layout, permuted the same two ways
    mova            ym1, [r0 + 2 * 32]
    mova            ym0, [r0 + 3 * 32]
    mova            ym27, ym16
    mova            ym28, ym17
    vpermi2w        ym27,  ym1, ym0
    vpermi2w        ym28,  ym1, ym0

    ; regroup the 128-bit halves into the four pass-1 operands
    vperm2i128      ym29, ym25, ym26, 0x20
    vperm2i128      ym30, ym25, ym26, 0x31
    vperm2i128      ym25, ym27, ym28, 0x20
    vperm2i128      ym26, ym27, ym28, 0x31

    ; duplicate each 256-bit operand into the upper half of its zmm register
    vinserti64x4    m29,        m29,      ym29, 1
    vinserti64x4    m25,        m25,      ym25, 1
    vinserti64x4    m30,        m30,      ym30, 1
    vinserti64x4    m26,        m26,      ym26, 1

    ; eight mmsize-wide coefficient rows for IDCT8_AVX512_PASS_1
    movu            m17,                [r4]
    movu            m18,                [r4 + 1 * mmsize]
    movu            m19,                [r4 + 2 * mmsize]
    movu            m20,                [r4 + 3 * mmsize]
    movu            m21,                [r4 + 4 * mmsize]
    movu            m22,                [r4 + 5 * mmsize]
    movu            m23,                [r4 + 6 * mmsize]
    movu            m24,                [r4 + 7 * mmsize]

    IDCT8_AVX512_PASS_1

    ; m3  = { low half of m3,  high half of m6 }
    ; m13 = { high half of m3, low half of m6  }
    vextracti64x4   ym13,       m3,      1
    vextracti64x4   ym14,       m6,      1
    vinserti64x4      m3,       m3,      ym14, 1
    vinserti64x4     m13,      m13,       ym6, 1

    ;pass2
    add             r2d,               r2d
    lea             r3,                [r2 * 3]
    lea             r5,                [avx512_idct8_1]
    lea             r6,                [avx512_idct8_2]

    IDCT8_AVX512_PASS_2

    ; first four output rows come from the low 256 bits of m8/m9
    vextracti128    xm3,               ym8, 1
    mova            [r1],              xm8
    mova            [r1 + r2],         xm3
    vextracti128    xm3,               ym9, 1
    mova            [r1 + r2 * 2],     xm9
    mova            [r1 + r3],         xm3

    lea             r1,                [r1 + r2 * 4]

    ; last four output rows come from the high 256 bits of m8/m9
    vextracti64x4   ym10,   m8, 1
    vextracti64x4   ym11,   m9, 1

    vextracti128    xm3,               ym10, 1
    mova            [r1],              xm10
    mova            [r1 + r2],         xm3
    vextracti128    xm3,               ym11, 1
    mova            [r1 + r2 * 2],     xm11
    mova            [r1 + r3],         xm3
    RET
%endif

;-----------------------------------------------------------------------------
; IDCT_PASS1 idx, mirror
; One step of the first pass of the AVX2 16x16 inverse DCT. Processes the two
; consecutive 16-byte coefficient rows idx and idx+1 of tab_idct16_2 and
; tab_idct16_1.
;
; %1 (idx)    : coefficient row index; also selects the scratch slot that
;               receives the "sum" results ([r3 + idx*32] and the next
;               32-byte slot)
; %2 (mirror) : scratch slot that receives the "difference" results
;               ([r3 + mirror*32] and the next 32-byte slot)
;
; In:    m0, m7, m6, m8 = operands multiplied by tab_idct16_2
;        m1, m3, m4, m2 = operands multiplied by tab_idct16_1
;        m14            = rounding constant added before the shift
;        r3             = scratch buffer base
; Clobb: m5, m9, m10, m11, m12, m13
;-----------------------------------------------------------------------------
%macro IDCT_PASS1 2
    vbroadcasti128  m5, [tab_idct16_2 + %1 * 16]

    ; m9 = horizontal sums of {m0, m7, m6, m8} x tab_idct16_2[idx]
    pmaddwd         m9, m0, m5
    pmaddwd         m10, m7, m5
    phaddd          m9, m10

    pmaddwd         m10, m6, m5
    pmaddwd         m11, m8, m5
    phaddd          m10, m11

    phaddd          m9, m10
    vbroadcasti128  m5, [tab_idct16_1 + %1 * 16]

    ; m10 = horizontal sums of {m1, m3, m4, m2} x tab_idct16_1[idx]
    pmaddwd         m10, m1, m5
    pmaddwd         m11, m3, m5
    phaddd          m10, m11

    pmaddwd         m11, m4, m5
    pmaddwd         m12, m2, m5
    phaddd          m11, m12

    phaddd          m10, m11

    ; m11 = (m9 + m10 + round) >> IDCT_SHIFT1
    paddd           m11, m9, m10
    paddd           m11, m14
    psrad           m11, IDCT_SHIFT1

    ; m9 = (m9 - m10 + round) >> IDCT_SHIFT1
    psubd           m9, m10
    paddd           m9, m14
    psrad           m9, IDCT_SHIFT1

    vbroadcasti128  m5, [tab_idct16_2 + %1 * 16 + 16]

    ; m10 = horizontal sums of {m0, m7, m6, m8} x tab_idct16_2[idx + 1]
    pmaddwd         m10, m0, m5
    pmaddwd         m12, m7, m5
    phaddd          m10, m12

    pmaddwd         m12, m6, m5
    pmaddwd         m13, m8, m5
    phaddd          m12, m13

    phaddd          m10, m12
    vbroadcasti128  m5, [tab_idct16_1 + %1 * 16  + 16]

    ; m12 = horizontal sums of {m1, m3, m4, m2} x tab_idct16_1[idx + 1]
    pmaddwd         m12, m1, m5
    pmaddwd         m13, m3, m5
    phaddd          m12, m13

    pmaddwd         m13, m4, m5
    pmaddwd         m5, m2
    phaddd          m13, m5

    phaddd          m12, m13

    ; m5 = (m10 + m12 + round) >> IDCT_SHIFT1
    paddd           m5, m10, m12
    paddd           m5, m14
    psrad           m5, IDCT_SHIFT1

    ; m10 = (m10 - m12 + round) >> IDCT_SHIFT1
    psubd           m10, m12
    paddd           m10, m14
    psrad           m10, IDCT_SHIFT1

    ; saturate to int16: m11 = sums, m9 = differences
    packssdw        m11, m5
    packssdw        m9, m10

    mova            m10, [idct16_shuff]
    mova            m5,  [idct16_shuff1]

    ; dword-permute, then write each 128-bit half to its own 32-byte slot
    vpermd          m12, m10, m11
    vpermd          m13, m5, m9
    mova            [r3 + %1 * 16 * 2], xm12
    mova            [r3 + %2 * 16 * 2], xm13
    vextracti128    [r3 + %2 * 16 * 2 + 32], m13, 1
    vextracti128    [r3 + %1 * 16 * 2 + 32], m12, 1
%endmacro

;-------------------------------------------------------
; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
;
; AVX2 16x16 inverse DCT.
;   r0 = src, r1 = dst, r2 = dstStride (doubled on entry before being used as
;        a byte offset), r3 = cursor into the 16*mmsize stack scratch buffer,
;        r4 = loop counter, r5 = tab_idct16_2, r6 = tab_idct16_1
;   Pass 1 runs twice; each iteration reads 16 bytes from each of the 16
;   input rows (row pitch 32 bytes) and writes 16 bytes into every 32-byte
;   scratch slot via IDCT_PASS1.
;   Pass 2 runs 8 times; each iteration consumes 64 bytes of scratch and
;   writes two 32-byte output rows.
;   m14 = pass-1 rounding (pd_64); it is reused as a coefficient temporary in
;   pass 2. m15 = pass-2 rounding, chosen by BIT_DEPTH.
;-------------------------------------------------------
INIT_YMM avx2
cglobal idct16, 3, 7, 16, 0-16*mmsize
%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m15,                [pd_128]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m15,                [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m15,                [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             IDCT_SHIFT1         7

    vbroadcasti128  m14,               [pd_64]

    add             r2d,               r2d
    mov             r3, rsp
    mov             r4d, 2

.pass1:
     ; each ymm: low lane = low qwords of rows n and n+8, high lane = high qwords
     movu            xm0, [r0 +  0 * 32]
     movu            xm1, [r0 +  8 * 32]
     punpckhqdq      xm2, xm0, xm1
     punpcklqdq      xm0, xm1
     vinserti128     m0, m0, xm2, 1

     movu            xm1, [r0 +  1 * 32]
     movu            xm2, [r0 +  9 * 32]
     punpckhqdq      xm3, xm1, xm2
     punpcklqdq      xm1, xm2
     vinserti128     m1, m1, xm3, 1

     movu            xm2, [r0 + 2  * 32]
     movu            xm3, [r0 + 10 * 32]
     punpckhqdq      xm4, xm2, xm3
     punpcklqdq      xm2, xm3
     vinserti128     m2, m2, xm4, 1

     movu            xm3, [r0 + 3  * 32]
     movu            xm4, [r0 + 11 * 32]
     punpckhqdq      xm5, xm3, xm4
     punpcklqdq      xm3, xm4
     vinserti128     m3, m3, xm5, 1

     movu            xm4, [r0 + 4  * 32]
     movu            xm5, [r0 + 12 * 32]
     punpckhqdq      xm6, xm4, xm5
     punpcklqdq      xm4, xm5
     vinserti128     m4, m4, xm6, 1

     movu            xm5, [r0 + 5  * 32]
     movu            xm6, [r0 + 13 * 32]
     punpckhqdq      xm7, xm5, xm6
     punpcklqdq      xm5, xm6
     vinserti128     m5, m5, xm7, 1

     movu            xm6, [r0 + 6  * 32]
     movu            xm7, [r0 + 14 * 32]
     punpckhqdq      xm8, xm6, xm7
     punpcklqdq      xm6, xm7
     vinserti128     m6, m6, xm8, 1

     movu            xm7, [r0 + 7  * 32]
     movu            xm8, [r0 + 15 * 32]
     punpckhqdq      xm9, xm7, xm8
     punpcklqdq      xm7, xm8
     vinserti128     m7, m7, xm9, 1

    ; word / dword / qword unpack ladder: transpose into the operand layout
    ; expected by IDCT_PASS1 (lane contents are listed as <row><column>)
    punpckhwd       m8, m0, m2                ;[8 10]
    punpcklwd       m0, m2                    ;[0 2]

    punpckhwd       m2, m1, m3                ;[9 11]
    punpcklwd       m1, m3                    ;[1 3]

    punpckhwd       m3, m4, m6                ;[12 14]
    punpcklwd       m4, m6                    ;[4 6]

    punpckhwd       m6, m5, m7                ;[13 15]
    punpcklwd       m5, m7                    ;[5 7]

    punpckhdq       m7, m0, m4                ;[02 22 42 62 03 23 43 63 06 26 46 66 07 27 47 67]
    punpckldq       m0, m4                    ;[00 20 40 60 01 21 41 61 04 24 44 64 05 25 45 65]

    punpckhdq       m4, m8, m3                ;[82 102 122 142 83 103 123 143 86 106 126 146 87 107 127 147]
    punpckldq       m8, m3                    ;[80 100 120 140 81 101 121 141 84 104 124 144 85 105 125 145]

    punpckhdq       m3, m1, m5                ;[12 32 52 72 13 33 53 73 16 36 56 76 17 37 57 77]
    punpckldq       m1, m5                    ;[10 30 50 70 11 31 51 71 14 34 54 74 15 35 55 75]

    punpckhdq       m5, m2, m6                ;[92 112 132 152 93 113 133 153 96 116 136 156 97 117 137 157]
    punpckldq       m2, m6                    ;[90 110 130 150 91 111 131 151 94 114 134 154 95 115 135 155]

    punpckhqdq      m6, m0, m8                ;[01 21 41 61 81 101 121 141 05 25 45 65 85 105 125 145]
    punpcklqdq      m0, m8                    ;[00 20 40 60 80 100 120 140 04 24 44 64 84 104 124 144]

    punpckhqdq      m8, m7, m4                ;[03 23 43 63 83 103 123 143 07 27 47 67 87 107 127 147]
    punpcklqdq      m7, m4                    ;[02 22 42 62 82 102 122 142 06 26 46 66 86 106 126 146]

    punpckhqdq      m4, m1, m2                ;[11 31 51 71 91 111 131 151 15 35 55 75 95 115 135 155]
    punpcklqdq      m1, m2                    ;[10 30 50 70 90 110 130 150 14 34 54 74 94 114 134 154]

    punpckhqdq      m2, m3, m5                ;[13 33 53 73 93 113 133 153 17 37 57 77 97 117 137 157]
    punpcklqdq      m3, m5                    ;[12 32 52 72 92 112 132 152 16 36 56 76 96 116 136 156]

    ; each call handles coefficient rows n and n+1 and writes scratch slots
    ; n, n+1 (sums) and mirror, mirror+1 (differences)
    IDCT_PASS1      0, 14
    IDCT_PASS1      2, 12
    IDCT_PASS1      4, 10
    IDCT_PASS1      6, 8

    ; next 8 input columns; second 16-byte half of every scratch slot
    add             r0, 16
    add             r3, 16
    dec             r4d
    jnz             .pass1

    mov             r3, rsp
    mov             r4d, 8
    lea             r5, [tab_idct16_2]
    lea             r6, [tab_idct16_1]

    ; keep the first seven 16-byte rows of tab_idct16_2 resident in m7..m13;
    ; the eighth row and all of tab_idct16_1 are re-broadcast into m14 inside
    ; the loop
    vbroadcasti128  m7,  [r5]
    vbroadcasti128  m8,  [r5 + 16]
    vbroadcasti128  m9,  [r5 + 32]
    vbroadcasti128  m10, [r5 + 48]
    vbroadcasti128  m11, [r5 + 64]
    vbroadcasti128  m12, [r5 + 80]
    vbroadcasti128  m13, [r5 + 96]

.pass2:
    movu            m1, [r3]
    vpermq          m0, m1, 0xD8

    ; m1 = horizontal sums against tab_idct16_2 rows 0..3
    pmaddwd         m1, m0, m7
    pmaddwd         m2, m0, m8
    phaddd          m1, m2

    pmaddwd         m2, m0, m9
    pmaddwd         m3, m0, m10
    phaddd          m2, m3

    phaddd          m1, m2

    ; m2 = horizontal sums against tab_idct16_2 rows 4..7
    pmaddwd         m2, m0, m11
    pmaddwd         m3, m0, m12
    phaddd          m2, m3

    vbroadcasti128  m14, [r5 + 112]
    pmaddwd         m3, m0, m13
    pmaddwd         m4, m0, m14
    phaddd          m3, m4

    phaddd          m2, m3

    movu            m3, [r3 + 32]
    vpermq          m0, m3, 0xD8

    ; m3 = horizontal sums against tab_idct16_1 rows 0..3
    vbroadcasti128  m14, [r6]
    pmaddwd         m3, m0, m14
    vbroadcasti128  m14, [r6 + 16]
    pmaddwd         m4, m0, m14
    phaddd          m3, m4

    vbroadcasti128  m14, [r6 + 32]
    pmaddwd         m4, m0, m14
    vbroadcasti128  m14, [r6 + 48]
    pmaddwd         m5, m0, m14
    phaddd          m4, m5

    phaddd          m3, m4

    ; m4 = horizontal sums against tab_idct16_1 rows 4..7
    vbroadcasti128  m14, [r6 + 64]
    pmaddwd         m4, m0, m14
    vbroadcasti128  m14, [r6 + 80]
    pmaddwd         m5, m0, m14
    phaddd          m4, m5

    vbroadcasti128  m14, [r6 + 96]
    pmaddwd         m6, m0, m14
    vbroadcasti128  m14, [r6 + 112]
    pmaddwd         m0, m14
    phaddd          m6, m0

    phaddd          m4, m6

    ; butterflies: m5/m6 = sums, m1/m2 = differences, rounded and shifted
    paddd           m5, m1, m3
    paddd           m5, m15
    psrad           m5, IDCT_SHIFT2

    psubd           m1, m3
    paddd           m1, m15
    psrad           m1, IDCT_SHIFT2

    paddd           m6, m2, m4
    paddd           m6, m15
    psrad           m6, IDCT_SHIFT2

    psubd           m2, m4
    paddd           m2, m15
    psrad           m2, IDCT_SHIFT2

    packssdw        m5, m6
    packssdw        m1, m2
    pshufb          m2, m1, [dct16_shuf1]

    ; low lanes -> row at r1, high lanes -> row at r1 + r2
    mova            [r1], xm5
    mova            [r1 + 16], xm2
    vextracti128    [r1 + r2], m5, 1
    vextracti128    [r1 + r2 + 16], m2, 1

    lea             r1, [r1 + 2 * r2]
    add             r3, 64
    dec             r4d
    jnz             .pass2
    RET


;-----------------------------------------------------------------------------
; IDCT16_AVX512_PASS1 idx, dst_sum, dst_diff
; One step of the first pass of the AVX-512 16x16 inverse DCT. Processes the
; two consecutive 64-byte coefficient rows idx and idx+1 of
; tab_AVX512_idct16_2 and tab_AVX512_idct16_1.
;
; %1 : coefficient row index (byte offset %1 * 64 into both tables)
; %2 : number of the register that receives the permuted "sum" results
; %3 : number of the register that receives the permuted "difference" results
;
; In:    m4, m6, m7, m8     = operands multiplied by tab_AVX512_idct16_2
;        m28, m29, m30, m31 = operands multiplied by tab_AVX512_idct16_1
;        m14                = rounding constant added before the shift
;        k1, k2             = dword merge masks; the idct16 avx512 function
;                             loads them with 0xAAAA and 0xCCCC
; Clobb: m5, m9, m10, m11, m12, m13, m16, m17
;
; Horizontal adds are done in two levels without phaddd: shift by 4 bytes,
; add and merge under k1 (adjacent dword pairs), then shift by 8 bytes, add
; and merge under k2 (adjacent pair sums).
;-----------------------------------------------------------------------------
%macro IDCT16_AVX512_PASS1 3
    movu            m5,  [tab_AVX512_idct16_2 + %1 * 64]
    pmaddwd         m9, m4, m5
    pmaddwd         m10, m6, m5

    vpsrldq         m16,   m9, 4
    paddd            m9,  m16
    vpslldq         m17,   m10, 4
    paddd            m10,  m17
    vmovdqu32        m9   {k1}, m10

    pmaddwd         m10, m7, m5
    pmaddwd         m11, m8, m5

    vpsrldq         m16,   m10, 4
    paddd            m10,  m16
    vpslldq         m17,   m11, 4
    paddd            m11,  m17
    vmovdqu32        m10   {k1}, m11

    ; m9 = full horizontal sums of {m4, m6, m7, m8} x row idx of table 2
    vpsrldq         m16,   m9, 8
    paddd            m9,  m16
    vpslldq         m17,   m10, 8
    paddd            m10,  m17
    vmovdqu32        m9   {k2}, m10

    mova            m5,  [tab_AVX512_idct16_1 + %1 * 64]
    pmaddwd         m10, m28, m5
    pmaddwd         m11, m29, m5

    vpsrldq         m16,   m10, 4
    paddd            m10,  m16
    vpslldq         m17,   m11, 4
    paddd            m11,  m17
    vmovdqu32        m10   {k1}, m11

    pmaddwd         m11, m30, m5
    pmaddwd         m12, m31, m5

    vpsrldq         m16,   m11, 4
    paddd            m11,  m16
    vpslldq         m17,   m12, 4
    paddd            m12,  m17
    vmovdqu32        m11   {k1}, m12

    ; m10 = full horizontal sums of {m28..m31} x row idx of table 1
    vpsrldq         m16,   m10, 8
    paddd            m10,  m16
    vpslldq         m17,   m11, 8
    paddd            m11,  m17
    vmovdqu32        m10   {k2}, m11

    ; m11 = (m9 + m10 + round) >> IDCT_SHIFT1
    paddd           m11, m9, m10
    paddd           m11, m14
    psrad           m11, IDCT_SHIFT1

    ; m9 = (m9 - m10 + round) >> IDCT_SHIFT1
    psubd           m9, m10
    paddd           m9, m14
    psrad           m9, IDCT_SHIFT1

    mova            m5,  [tab_AVX512_idct16_2 + %1 * 64 + 64]
    pmaddwd         m10, m4, m5
    pmaddwd         m12, m6, m5


    vpsrldq         m16,   m10, 4
    paddd            m10,  m16
    vpslldq         m17,   m12, 4
    paddd            m12,  m17
    vmovdqu32        m10   {k1}, m12

    pmaddwd         m12, m7, m5
    pmaddwd         m13, m8, m5


    vpsrldq         m16,   m12, 4
    paddd            m12,  m16
    vpslldq         m17,   m13, 4
    paddd            m13,  m17
    vmovdqu32        m12   {k1}, m13


    ; m10 = full horizontal sums of {m4, m6, m7, m8} x row idx+1 of table 2
    vpsrldq         m16,   m10, 8
    paddd            m10,  m16
    vpslldq         m17,   m12, 8
    paddd            m12,  m17
    vmovdqu32        m10   {k2}, m12



    mova            m5,  [tab_AVX512_idct16_1 + %1 * 64 + 64] 
    pmaddwd         m12, m28, m5
    pmaddwd         m13, m29, m5


    vpsrldq         m16,   m12, 4
    paddd            m12,  m16
    vpslldq         m17,   m13, 4
    paddd            m13,  m17
    vmovdqu32        m12   {k1}, m13

    pmaddwd         m13, m30, m5
    pmaddwd         m5, m31


    vpsrldq         m16,   m13, 4
    paddd            m13,  m16
    vpslldq         m17,   m5, 4
    paddd            m5,  m17
    vmovdqu32        m13   {k1}, m5


    ; m12 = full horizontal sums of {m28..m31} x row idx+1 of table 1
    vpsrldq         m16,   m12, 8
    paddd            m12,  m16
    vpslldq         m17,   m13, 8
    paddd            m13,  m17
    vmovdqu32        m12   {k2}, m13


    ; m5 = (m10 + m12 + round) >> IDCT_SHIFT1
    paddd           m5, m10, m12
    paddd           m5, m14
    psrad           m5, IDCT_SHIFT1

    ; m10 = (m10 - m12 + round) >> IDCT_SHIFT1
    psubd           m10, m12
    paddd           m10, m14
    psrad           m10, IDCT_SHIFT1

    ; saturate to int16: m11 = sums, m9 = differences
    packssdw        m11, m5
    packssdw        m9, m10

    mova            m10, [idct16_AVX512_shuff]
    mova            m5,  [idct16_AVX512_shuff1]

    vpermd          m%2, m10, m11
    vpermd          m%3, m5, m9
%endmacro

;-----------------------------------------------------------------------------
; IDCT16_AVX512_PASS2 src_a, src_b
; Second pass of the AVX-512 16x16 inverse DCT for one pair of intermediate
; registers.
;
; %1 : number of the register multiplied by the tab_idct16_2 coefficients
; %2 : number of the register multiplied by the tab_idct16_1 coefficients
;
; In:    m7..m13                  = seven coefficient vectors applied to m%1
;        [r5 + 112]               = eighth coefficient vector applied to m%1
;        m16, m17, m19, m23,
;        m28, m29, m30            = seven coefficient vectors applied to m%2
;        [r6 + 112]               = eighth coefficient vector applied to m%2
;        m15                      = rounding constant added before the shift
;        k1, k2                   = dword merge masks; the idct16 avx512
;                                   function loads them with 0xAAAA and 0xCCCC
; Out:   m5 = packed int16 "sum" results
;        m2 = packed int16 "difference" results, byte-shuffled with
;             idct16_AVX512_shuff6
; Clobb: m0, m1, m3, m4, m6, m14, m31
;
; Horizontal adds use the same two-level shift/add/masked-merge scheme as
; IDCT16_AVX512_PASS1 (4-byte shift under k1, then 8-byte shift under k2).
;-----------------------------------------------------------------------------
%macro IDCT16_AVX512_PASS2 2
    vpermq          m0, m%1, 0xD8

    pmaddwd         m1, m0, m7
    pmaddwd         m2, m0, m8


    vpsrldq         m14,   m1, 4
    paddd            m1,  m14
    vpslldq         m31,   m2, 4
    paddd            m2,  m31
    vmovdqu32        m1   {k1}, m2

    pmaddwd         m2, m0, m9
    pmaddwd         m3, m0, m10


    vpsrldq         m14,   m2, 4
    paddd            m2,  m14
    vpslldq         m31,   m3, 4
    paddd            m3,  m31
    vmovdqu32        m2   {k1}, m3


    ; m1 = full horizontal sums of m%1 against m7..m10
    vpsrldq         m14,   m1, 8
    paddd            m1,  m14
    vpslldq         m31,   m2, 8
    paddd            m2,  m31
    vmovdqu32        m1   {k2}, m2

    pmaddwd         m2, m0, m11
    pmaddwd         m3, m0, m12


    vpsrldq         m14,   m2, 4
    paddd            m2,  m14
    vpslldq         m31,   m3, 4
    paddd            m3,  m31
    vmovdqu32        m2   {k1}, m3

    ; m14 doubles as a scratch register, so the eighth coefficient vector is
    ; reloaded on every expansion
    vbroadcasti64x2  m14, [r5 + 112]
    pmaddwd         m3, m0, m13
    pmaddwd         m4, m0, m14


    vpsrldq         m14,   m3, 4
    paddd            m3,  m14
    vpslldq         m31,   m4, 4
    paddd            m4,  m31
    vmovdqu32        m3   {k1}, m4


    ; m2 = full horizontal sums of m%1 against m11..m13 and [r5 + 112]
    vpsrldq         m14,   m2, 8
    paddd            m2,  m14
    vpslldq         m31,   m3, 8
    paddd            m3,  m31
    vmovdqu32        m2   {k2}, m3

    vpermq          m0, m%2, 0xD8
    pmaddwd         m3, m0, m16
    pmaddwd         m4, m0, m17


    vpsrldq         m14,   m3, 4
    paddd            m3,  m14
    vpslldq         m31,   m4, 4
    paddd            m4,  m31
    vmovdqu32        m3   {k1}, m4

    pmaddwd         m4, m0, m19
    pmaddwd         m5, m0, m23


    vpsrldq         m14,   m4, 4
    paddd            m4,  m14
    vpslldq         m31,   m5, 4
    paddd            m5,  m31
    vmovdqu32        m4   {k1}, m5


    ; m3 = full horizontal sums of m%2 against m16, m17, m19, m23
    vpsrldq         m14,   m3, 8
    paddd            m3,  m14
    vpslldq         m31,   m4, 8
    paddd            m4,  m31
    vmovdqu32        m3   {k2}, m4


    pmaddwd         m4, m0, m28
    pmaddwd         m5, m0, m29

    vpsrldq         m14,   m4, 4
    paddd            m4,  m14
    vpslldq         m31,   m5, 4
    paddd            m5,  m31
    vmovdqu32        m4   {k1}, m5

    pmaddwd         m6, m0, m30
    ; m31 doubles as a scratch register, so this coefficient vector is
    ; reloaded on every expansion
    vbroadcasti64x2  m31, [r6 + 112]
    pmaddwd         m0, m31


    vpsrldq         m14,   m6, 4
    paddd            m6,  m14
    vpslldq         m31,   m0, 4
    paddd            m0,  m31
    vmovdqu32        m6   {k1}, m0


    ; m4 = full horizontal sums of m%2 against m28..m30 and [r6 + 112]
    vpsrldq         m14,   m4, 8
    paddd            m4,  m14
    vpslldq         m31,   m6, 8
    paddd            m6,  m31
    vmovdqu32        m4   {k2}, m6

    ; butterflies: m5/m6 = sums, m1/m2 = differences, rounded and shifted
    paddd           m5, m1, m3
    paddd           m5, m15
    psrad           m5, IDCT_SHIFT2

    psubd           m1, m3
    paddd           m1, m15
    psrad           m1, IDCT_SHIFT2

    paddd           m6, m2, m4
    paddd           m6, m15
    psrad           m6, IDCT_SHIFT2

    psubd           m2, m4
    paddd           m2, m15
    psrad           m2, IDCT_SHIFT2

    packssdw        m5, m6
    packssdw        m1, m2
    pshufb          m2, m1, [idct16_AVX512_shuff6]
%endmacro


;-----------------------------------------------------------------------------
; IDCT16_AVX512_GATHER8 first_row, d0, d1, d2, d3
; Helper for the AVX-512 idct16 pass 1. Reads 16 bytes from each of the eight
; input rows first_row + {0, 8, 2, 10, 4, 12, 6, 14} (row pitch 32 bytes,
; base r0), permutes them with the index vectors held in ym2/ym3 (words) and
; ym26/ym27 (dwords), and leaves four operands in m%2..m%5 with the 256-bit
; result duplicated into both halves of each zmm register.
;
; In:    r0, ym2, ym3, ym26, ym27
; Out:   m%2, m%3, m%4, m%5
; Clobb: m0, m1, m9, m10, m11, m12
;-----------------------------------------------------------------------------
%macro IDCT16_AVX512_GATHER8 5
    movu            xm0,  [r0 + (%1 +  0) * 32]
    vinserti128     ym0,  ym0, [r0 + (%1 +  8) * 32], 1
    movu            xm1,  [r0 + (%1 +  2) * 32]
    vinserti128     ym1,  ym1, [r0 + (%1 + 10) * 32], 1

    mova            ym9,  ym2
    mova            ym10, ym3
    vpermi2w        ym9,  ym0, ym1
    vpermi2w        ym10, ym0, ym1

    movu            xm0,  [r0 + (%1 +  4) * 32]
    vinserti128     ym0,  ym0, [r0 + (%1 + 12) * 32], 1
    movu            xm1,  [r0 + (%1 +  6) * 32]
    vinserti128     ym1,  ym1, [r0 + (%1 + 14) * 32], 1

    mova            ym11, ym2
    mova            ym12, ym3
    vpermi2w        ym11, ym0, ym1
    vpermi2w        ym12, ym0, ym1

    mova            ym%2, ym26
    mova            ym%3, ym27
    vpermi2d        ym%2, ym9, ym11
    vpermi2d        ym%3, ym9, ym11

    mova            ym%4, ym26
    mova            ym%5, ym27
    vpermi2d        ym%4, ym10, ym12
    vpermi2d        ym%5, ym10, ym12

    vpermq          ym%2, ym%2, q3120
    vpermq          ym%3, ym%3, q3120
    vpermq          ym%4, ym%4, q3120
    vpermq          ym%5, ym%5, q3120

    ; replicate the 256-bit result into the upper half of each zmm register
    vinserti64x4    m%2,  m%2, ym%2, 1
    vinserti64x4    m%3,  m%3, ym%3, 1
    vinserti64x4    m%4,  m%4, ym%4, 1
    vinserti64x4    m%5,  m%5, ym%5, 1
%endmacro

;-----------------------------------------------------------------------------
; IDCT16_AVX512_STORE4
; Helper for the AVX-512 idct16 pass 2. Writes the four 32-byte output rows
; held in m5 (first 16 bytes of each row) and m2 (last 16 bytes of each row)
; to r1, r1 + r2, r1 + 2*r2 and r1 + 3*r2.
;
; In:    r1 = destination of the first row, r2 = row pitch in bytes, m5, m2
; Out:   r1 advanced by 2 * r2
; Clobb: m14, m31
;-----------------------------------------------------------------------------
%macro IDCT16_AVX512_STORE4 0
    mova            [r1],           xm5
    mova            [r1 + 16],      xm2
    vextracti128    [r1 + r2],      ym5, 1
    vextracti128    [r1 + r2 + 16], ym2, 1
    vextracti64x4   ym14, m5, 1
    vextracti64x4   ym31, m2, 1
    lea             r1, [r1 + 2 * r2]
    mova            [r1],           xm14
    mova            [r1 + 16],      xm31
    vextracti128    [r1 + r2],      ym14, 1
    vextracti128    [r1 + r2 + 16], ym31, 1
%endmacro

;-------------------------------------------------------
; void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
;
; AVX-512 16x16 inverse DCT.
;   r0 = src, r1 = dst, r2 = dstStride (doubled on entry before being used as
;        a byte offset), r5 = tab_idct16_2, r6 = tab_idct16_1, r7 = scratch
;   k1 = 0xAAAA, k2 = 0xCCCC : dword merge masks used by the PASS1/PASS2 macros
;   m14 = pass-1 rounding (pd_64), m15 = pass-2 rounding (by BIT_DEPTH)
;   Pass 1 is fully unrolled: two column halves (r0 and r0 + 16), each
;   gathered into m4/m6/m7/m8 (even rows) and m28..m31 (odd rows) and reduced
;   by two IDCT16_AVX512_PASS1 calls into m18..m25.
;   Pass 2 runs four IDCT16_AVX512_PASS2 calls, each producing four output
;   rows.
;-------------------------------------------------------
INIT_ZMM avx512
cglobal idct16, 3, 8, 32
%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m15,                [pd_128]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m15,                [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m15,                [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif
%define             IDCT_SHIFT1         7

    vpbroadcastd    m14,               [pd_64]

    add             r2d,               r2d

    mov             r7d,    0xAAAA
    kmovd            k1,    r7d
    mov             r7d,    0xCCCC
    kmovd            k2,    r7d
    mova          ym2, [idct16_shuff2]
    mova          ym3, [idct16_shuff3]
    mova         ym26, [idct16_shuff4]
    mova         ym27, [idct16_shuff5]

    ;pass1, first 16 bytes of every input row
    IDCT16_AVX512_GATHER8    0,  4,  6,  7,  8      ; even rows
    IDCT16_AVX512_GATHER8    1, 28, 29, 30, 31      ; odd rows

    IDCT16_AVX512_PASS1      0, 18, 19
    IDCT16_AVX512_PASS1      2, 20, 21

    ;pass1, second 16 bytes of every input row
    add             r0, 16

    IDCT16_AVX512_GATHER8    0,  4,  6,  7,  8      ; even rows
    IDCT16_AVX512_GATHER8    1, 28, 29, 30, 31      ; odd rows

    IDCT16_AVX512_PASS1      0, 22, 23
    IDCT16_AVX512_PASS1      2, 24, 25

    ; merge the results of the two column halves with qword permutes
    mova       m26,    [idct16_AVX512_shuff2]
    mova       m27,    [idct16_AVX512_shuff3]
    vpermi2q   m26,    m18, m22
    vpermi2q   m27,    m18, m22
    mova       m18,    [idct16_AVX512_shuff2]
    mova       m22,    [idct16_AVX512_shuff3]
    vpermi2q   m18,    m20, m24
    vpermi2q   m22,    m20, m24
    mova       m20,    [idct16_AVX512_shuff4]
    mova       m24,    [idct16_AVX512_shuff5]
    vpermi2q   m20,    m21, m25
    vpermi2q   m24,    m21, m25
    mova       m21,    [idct16_AVX512_shuff4]
    mova       m25,    [idct16_AVX512_shuff5]
    vpermi2q   m21,    m19, m23
    vpermi2q   m25,    m19, m23

    ;pass2
    lea             r5, [tab_idct16_2]
    lea             r6, [tab_idct16_1]

    ; coefficient vectors kept resident across all IDCT16_AVX512_PASS2 calls;
    ; the [r5 + 112] and [r6 + 112] vectors are loaded inside the macro
    vbroadcasti64x2  m7,  [r5]
    vbroadcasti64x2  m8,  [r5 + 16]
    vbroadcasti64x2  m9,  [r5 + 32]
    vbroadcasti64x2  m10, [r5 + 48]
    vbroadcasti64x2  m11, [r5 + 64]
    vbroadcasti64x2  m12, [r5 + 80]
    vbroadcasti64x2  m13, [r5 + 96]

    vbroadcasti64x2  m16, [r6]
    vbroadcasti64x2  m17, [r6 + 16]
    vbroadcasti64x2  m19, [r6 + 32]
    vbroadcasti64x2  m23, [r6 + 48]
    vbroadcasti64x2  m28, [r6 + 64]
    vbroadcasti64x2  m29, [r6 + 80]
    vbroadcasti64x2  m30, [r6 + 96]

    IDCT16_AVX512_PASS2 26, 27
    IDCT16_AVX512_STORE4

    IDCT16_AVX512_PASS2 18, 22
    lea             r1, [r1 + 2 * r2]
    IDCT16_AVX512_STORE4

    IDCT16_AVX512_PASS2 20, 24
    lea             r1, [r1 + 2 * r2]
    IDCT16_AVX512_STORE4

    IDCT16_AVX512_PASS2 21, 25
    lea             r1, [r1 + 2 * r2]
    IDCT16_AVX512_STORE4
    RET



;-----------------------------------------------------------------------------
; IDCT32_PASS1 idx
; One step of the first pass of the AVX2 32x32 inverse DCT. Uses the
; tab_idct32_1 rows idx and 15 - idx (32 bytes each) and row idx (16 bytes)
; of tab_idct32_2 and tab_idct32_3.
;
; %1 (idx) : coefficient row index; also selects the 64-byte scratch slots
;            written through r3 (ascending) and r4 (descending)
;
; In:    m4, m8, m2, m1 = operands multiplied by tab_idct32_1
;        m0, m7         = operands multiplied by tab_idct32_2
;        m5, m6         = operands multiplied by tab_idct32_3
;        m15            = rounding constant added before the shift
;        r3, r4         = scratch buffer cursors
; Clobb: m3, m9, m10, m11, m12, m13
;
; Each store below writes one dword (two int16 values).
;-----------------------------------------------------------------------------
%macro IDCT32_PASS1 1
    vbroadcasti128  m3, [tab_idct32_1 + %1 * 32]
    vbroadcasti128  m13, [tab_idct32_1 + %1 * 32 + 16]
    pmaddwd         m9, m4, m3
    pmaddwd         m10, m8, m13
    phaddd          m9, m10

    pmaddwd         m10, m2, m3
    pmaddwd         m11, m1, m13
    phaddd          m10, m11

    phaddd          m9, m10

    ; same operands against the mirrored tab_idct32_1 row (15 - idx)
    vbroadcasti128  m3, [tab_idct32_1 + (15 - %1) * 32]
    vbroadcasti128  m13, [tab_idct32_1 + (15- %1) * 32 + 16]
    pmaddwd         m10, m4, m3
    pmaddwd         m11, m8, m13
    phaddd          m10, m11

    pmaddwd         m11, m2, m3
    pmaddwd         m12, m1, m13
    phaddd          m11, m12

    phaddd          m10, m11
    phaddd          m9, m10                       ;[row0s0 row2s0 row0s15 row2s15 row1s0 row3s0 row1s15 row3s15]

    vbroadcasti128  m3, [tab_idct32_2 + %1 * 16]
    pmaddwd         m10, m0, m3
    pmaddwd         m11, m7, m3
    phaddd          m10, m11
    phaddd          m10, m10

    vbroadcasti128  m3, [tab_idct32_3 + %1 * 16]
    pmaddwd         m11, m5, m3
    pmaddwd         m12, m6, m3
    phaddd          m11, m12
    phaddd          m11, m11

    paddd           m12, m10, m11                 ;[row0a0 row2a0 NIL NIL row1sa0 row3a0 NIL NIL]
    psubd           m10, m11                      ;[row0a15 row2a15 NIL NIL row1a15 row3a15 NIL NIL]

    punpcklqdq      m12, m10                      ;[row0a0 row2a0 row0a15 row2a15 row1a0 row3a0 row1a15 row3a15]
    ; m10 = (m9 + m12 + round) >> IDCT_SHIFT1
    paddd           m10, m9, m12
    paddd           m10, m15
    psrad           m10, IDCT_SHIFT1

    ; m12 = (m12 - m9 + round) >> IDCT_SHIFT1
    psubd           m12, m9
    paddd           m12, m15
    psrad           m12, IDCT_SHIFT1

    ; saturate to int16, then scatter the eight dwords to the scratch buffer:
    ; low lane (xm10) and high lane (xm12) land 32 bytes apart
    packssdw        m10, m12
    vextracti128    xm12, m10, 1
    movd            [r3 + %1 * 64], xm10
    movd            [r3 + 32 + %1 * 64], xm12
    pextrd          [r4 - %1 * 64], xm10, 1
    pextrd          [r4+ 32 - %1 * 64], xm12, 1
    pextrd          [r3 + 16 * 64 + %1 *64], xm10, 3
    pextrd          [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3
    pextrd          [r4 + 16 * 64 - %1 * 64], xm10, 2
    pextrd          [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2
%endmacro

;-------------------------------------------------------
; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
;
; AVX2 32x32 inverse DCT, done in two passes through a 32 * 64-byte
; scratch buffer on the stack.
;   r0 = src        32 rows of 32 int16 coefficients (row pitch 64 bytes)
;   r1 = dst        written with aligned 32-byte stores (mova)
;   r2 = dstStride  in int16 units; doubled to bytes before pass 2
; Pass 1 runs 8 iterations of 4 columns each, pass 2 runs 32 iterations of
; one scratch row each.
;-------------------------------------------------------

; TODO: Reduce PHADDD instruction by PADDD

INIT_YMM avx2
cglobal idct32, 3, 6, 16, 0-32*64

%define             IDCT_SHIFT1         7

    ; m15 = pass-1 rounding constant (pd_64, presumably 1 << (IDCT_SHIFT1 - 1))
    vbroadcasti128  m15, [pd_64]

    ; r3 -> scratch row 0, r4 -> scratch row 15, r5d = number of 4-column groups
    mov             r3, rsp
    lea             r4, [r3 + 15 * 64]
    mov             r5d, 8

.pass1:
    ; Gather 4 coefficients (8 bytes) from each of the 32 input rows, four rows
    ; per ymm register. The bracketed comments list the source rows of each
    ; 64-bit quarter, lowest quarter first.
    movq            xm0,    [r0 +  2 * 64]
    movq            xm1,    [r0 + 18 * 64]
    punpcklqdq      xm0, xm0, xm1
    movq            xm1,    [r0 +  0 * 64]
    movq            xm2,    [r0 + 16 * 64]
    punpcklqdq      xm1, xm1, xm2
    vinserti128     m0,  m0,  xm1, 1             ;[2 18 0 16]

    movq            xm1,    [r0 + 1 * 64]
    movq            xm2,    [r0 + 9 * 64]
    punpcklqdq      xm1, xm1, xm2
    movq            xm2,    [r0 + 17 * 64]
    movq            xm3,    [r0 + 25 * 64]
    punpcklqdq      xm2, xm2, xm3
    vinserti128     m1,  m1,  xm2, 1             ;[1 9 17 25]

    movq            xm2,    [r0 + 6 * 64]
    movq            xm3,    [r0 + 22 * 64]
    punpcklqdq      xm2, xm2, xm3
    movq            xm3,    [r0 + 4 * 64]
    movq            xm4,    [r0 + 20 * 64]
    punpcklqdq      xm3, xm3, xm4
    vinserti128     m2,  m2,  xm3, 1             ;[6 22 4 20]

    movq            xm3,    [r0 + 3 * 64]
    movq            xm4,    [r0 + 11 * 64]
    punpcklqdq      xm3, xm3, xm4
    movq            xm4,    [r0 + 19 * 64]
    movq            xm5,    [r0 + 27 * 64]
    punpcklqdq      xm4, xm4, xm5
    vinserti128     m3,  m3,  xm4, 1             ;[3 11 19 27]

    movq            xm4,    [r0 + 10 * 64]
    movq            xm5,    [r0 + 26 * 64]
    punpcklqdq      xm4, xm4, xm5
    movq            xm5,    [r0 + 8 * 64]
    movq            xm6,    [r0 + 24 * 64]
    punpcklqdq      xm5, xm5, xm6
    vinserti128     m4,  m4,  xm5, 1             ;[10 26 8 24]

    movq            xm5,    [r0 + 5 * 64]
    movq            xm6,    [r0 + 13 * 64]
    punpcklqdq      xm5, xm5, xm6
    movq            xm6,    [r0 + 21 * 64]
    movq            xm7,    [r0 + 29 * 64]
    punpcklqdq      xm6, xm6, xm7
    vinserti128     m5,  m5,  xm6, 1             ;[5 13 21 29]

    movq            xm6,    [r0 + 14 * 64]
    movq            xm7,    [r0 + 30 * 64]
    punpcklqdq      xm6, xm6, xm7
    movq            xm7,    [r0 + 12 * 64]
    movq            xm8,    [r0 + 28 * 64]
    punpcklqdq      xm7, xm7, xm8
    vinserti128     m6,  m6,  xm7, 1             ;[14 30 12 28]

    movq            xm7,    [r0 + 7 * 64]
    movq            xm8,    [r0 + 15 * 64]
    punpcklqdq      xm7, xm7, xm8
    movq            xm8,    [r0 + 23 * 64]
    movq            xm9,    [r0 + 31 * 64]
    punpcklqdq      xm8, xm8, xm9
    vinserti128     m7,  m7,  xm8, 1             ;[7 15 23 31]

    ; Transpose network: word, dword, qword interleaves, then 128-bit lane
    ; permutes. In the element comments from the dword stage on, the last digit
    ; is the column (0..3) and the leading digits are the row, e.g. "102" is
    ; row 10, column 2.
    punpckhwd       m8, m0, m2                  ;[18 22 16 20]
    punpcklwd       m0, m2                      ;[2 6 0 4]

    punpckhwd       m2, m1, m3                  ;[9 11 25 27]
    punpcklwd       m1, m3                      ;[1 3 17 19]

    punpckhwd       m3, m4, m6                  ;[26 30 24 28]
    punpcklwd       m4, m6                      ;[10 14 8 12]

    punpckhwd       m6, m5, m7                  ;[13 15 29 31]
    punpcklwd       m5, m7                      ;[5 7 21 23]

    punpckhdq       m7, m0, m4                  ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123]
    punpckldq       m0, m4                      ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121]

    punpckhdq       m4, m8, m3                  ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283]
    punpckldq       m8, m3                      ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281]

    punpckhdq       m3, m1, m5                  ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233]
    punpckldq       m1, m5                      ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231]

    punpckhdq       m5, m2, m6                  ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313]
    punpckldq       m2, m6                      ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311]

    punpckhqdq      m6, m0, m8                  ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281]
    punpcklqdq      m0, m8                      ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280]

    punpckhqdq      m8, m7, m4                  ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283]
    punpcklqdq      m7, m4                      ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282]

    punpckhqdq      m4, m1, m2                  ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311]
    punpcklqdq      m1, m2                      ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310]

    punpckhqdq      m2, m3, m5                  ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313]
    punpcklqdq      m3, m5                      ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312]

    vperm2i128      m5, m0, m6, 0x20            ;[20 60 100 140 180 220 260 300 21 61 101 141 181 221 261 301]
    vperm2i128      m0, m0, m6, 0x31            ;[00 40 80 120 160 200 240 280 01 41 81 121 161 201 241 281]

    vperm2i128      m6, m7, m8, 0x20            ;[22 62 102 142 182 222 262 302 23 63 103 143 183 223 263 303]
    vperm2i128      m7, m7, m8, 0x31            ;[02 42 82 122 162 202 242 282 03 43 83 123 163 203 243 283]

    vperm2i128      m8, m1, m4, 0x31            ;[170 190 210 230 250 270 290 310 171 191 211 231 251 271 291 311]
    vperm2i128      m4, m1, m4, 0x20            ;[10 30 50 70 90 110 130 150 11 31 51 71 91 111 131 151]

    vperm2i128      m1, m3, m2, 0x31            ;[172 192 212 232 252 272 292 312 173 193 213 233 253 273 293 313]
    vperm2i128      m2, m3, m2, 0x20            ;[12 32 52 72 92 112 132 152 13 33 53 73 93 113 133 153]

    ; Each invocation writes scratch rows k, 15-k, 16+k and 31-k, so k = 0..7
    ; covers all 32 rows for this 4-column group.
    IDCT32_PASS1 0
    IDCT32_PASS1 1
    IDCT32_PASS1 2
    IDCT32_PASS1 3
    IDCT32_PASS1 4
    IDCT32_PASS1 5
    IDCT32_PASS1 6
    IDCT32_PASS1 7

    ; Next 4 input columns (8 bytes); each scratch row advances by one dword
    ; in both of its 32-byte halves.
    add             r0, 8
    add             r3, 4
    add             r4, 4
    dec             r5d
    jnz             .pass1

    ; Pass-2 shift and rounding constant depend on the build-time bit depth
    ; (pd_N presumably holds N = 1 << (IDCT_SHIFT2 - 1)).
%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m15,                [pd_128]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m15,                [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m15,                [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif

    ; r3 -> scratch row 0, r2 = stride in bytes, r4d = rows left
    mov             r3, rsp
    add             r2d, r2d
    mov             r4d, 32

    ; Keep the first eight 32-byte rows of tab_idct32_4 in registers; the other
    ; eight rows and all of tab_idct32_1 are used as memory operands below.
    mova            m7,  [tab_idct32_4]
    mova            m8,  [tab_idct32_4 + 32]
    mova            m9,  [tab_idct32_4 + 64]
    mova            m10, [tab_idct32_4 + 96]
    mova            m11, [tab_idct32_4 + 128]
    mova            m12, [tab_idct32_4 + 160]
    mova            m13, [tab_idct32_4 + 192]
    mova            m14, [tab_idct32_4 + 224]
.pass2:
    ; One scratch row per iteration. Pass 1 stored columns 0 and 2 of every
    ; 4-column group in the first 32 bytes and columns 1 and 3 in the second,
    ; so m0 holds the even-indexed and m1 the odd-indexed coefficients.
    movu            m0, [r3]
    movu            m1, [r3 + 32]

    ; m2 = eight dot products of m0 with tab_idct32_4 rows 0..7
    pmaddwd         m2, m0, m7
    pmaddwd         m3, m0, m8
    phaddd          m2, m3

    pmaddwd         m3, m0, m9
    pmaddwd         m4, m0, m10
    phaddd          m3, m4

    phaddd          m2, m3

    pmaddwd         m3, m0, m11
    pmaddwd         m4, m0, m12
    phaddd          m3, m4

    pmaddwd         m4, m0, m13
    pmaddwd         m5, m0, m14
    phaddd          m4, m5

    phaddd          m3, m4

    ; phaddd only reduces within a 128-bit lane; add the two lanes' partial sums.
    vperm2i128      m4, m2, m3, 0x31
    vperm2i128      m2, m2, m3, 0x20
    paddd           m2, m4

    ; m3 = eight dot products of m0 with tab_idct32_4 rows 8..15
    pmaddwd         m3, m0, [tab_idct32_4 + 256]
    pmaddwd         m4, m0, [tab_idct32_4 + 288]
    phaddd          m3, m4

    pmaddwd         m4, m0, [tab_idct32_4 + 320]
    pmaddwd         m5, m0, [tab_idct32_4 + 352]
    phaddd          m4, m5

    phaddd          m3, m4

    pmaddwd         m4, m0, [tab_idct32_4 + 384]
    pmaddwd         m5, m0, [tab_idct32_4 + 416]
    phaddd          m4, m5

    pmaddwd         m5, m0, [tab_idct32_4 + 448]
    pmaddwd         m0,     [tab_idct32_4 + 480]
    phaddd          m5, m0

    phaddd          m4, m5

    vperm2i128      m0, m3, m4, 0x31
    vperm2i128      m3, m3, m4, 0x20
    paddd           m3, m0

    ; m4 = eight dot products of m1 with tab_idct32_1 rows 0..7
    pmaddwd         m4, m1, [tab_idct32_1]
    pmaddwd         m0, m1, [tab_idct32_1 + 32]
    phaddd          m4, m0

    pmaddwd         m5, m1, [tab_idct32_1 + 64]
    pmaddwd         m0, m1, [tab_idct32_1 + 96]
    phaddd          m5, m0

    phaddd          m4, m5

    pmaddwd         m5, m1, [tab_idct32_1 + 128]
    pmaddwd         m0, m1, [tab_idct32_1 + 160]
    phaddd          m5, m0

    pmaddwd         m6, m1, [tab_idct32_1 + 192]
    pmaddwd         m0, m1, [tab_idct32_1 + 224]
    phaddd          m6, m0

    phaddd          m5, m6

    vperm2i128      m0, m4, m5, 0x31
    vperm2i128      m4, m4, m5, 0x20
    paddd           m4, m0

    ; m5 = eight dot products of m1 with tab_idct32_1 rows 8..15
    pmaddwd         m5, m1, [tab_idct32_1 + 256]
    pmaddwd         m0, m1, [tab_idct32_1 + 288]
    phaddd          m5, m0

    pmaddwd         m6, m1, [tab_idct32_1 + 320]
    pmaddwd         m0, m1, [tab_idct32_1 + 352]
    phaddd          m6, m0

    phaddd          m5, m6

    pmaddwd         m6, m1, [tab_idct32_1 + 384]
    pmaddwd         m0, m1, [tab_idct32_1 + 416]
    phaddd          m6, m0

    pmaddwd         m0, m1, [tab_idct32_1 + 448]
    pmaddwd         m1,     [tab_idct32_1 + 480]
    phaddd          m0, m1

    phaddd          m6, m0

    vperm2i128      m0, m5, m6, 0x31
    vperm2i128      m5, m5, m6, 0x20
    paddd           m5, m0

    ; Butterfly: m6 / m4 = sums (m2 + m4, m3 + m5), m2 / m3 = differences,
    ; each rounded with m15 and shifted by IDCT_SHIFT2.
    paddd           m6, m2, m4
    paddd           m6, m15
    psrad           m6, IDCT_SHIFT2

    psubd           m2, m4
    paddd           m2, m15
    psrad           m2, IDCT_SHIFT2

    paddd           m4, m3, m5
    paddd           m4, m15
    psrad           m4, IDCT_SHIFT2

    psubd           m3, m5
    paddd           m3, m15
    psrad           m3, IDCT_SHIFT2

    packssdw        m6, m4
    packssdw        m2, m3

    ; packssdw interleaves per 128-bit lane; vpermq regroups the qwords so the
    ; sums come out in ascending order. For the differences the qword order is
    ; [m3 | m2] and dct16_shuf1 then reorders the words (presumably reversing
    ; them within each lane - table not visible here).
    vpermq          m6, m6, 0xD8
    vpermq          m2, m2, 0x8D
    pshufb          m2, [dct16_shuf1]

    ; Aligned stores: dst and the byte stride must keep every row 32-byte aligned.
    mova            [r1], m6
    mova            [r1 + 32], m2

    add             r1, r2
    add             r3, 64
    dec             r4d
    jnz             .pass2
    RET


;-----------------------------------------------------------------------------
; IDCT32_AVX512_PASS1 k, evenA, evenB, oddA, oddB
;   First-pass butterfly for the AVX-512 idct32 below. The low 256 bits of
;   the working registers produce the same four scratch rows as the AVX2
;   macro for index k (k, 15-k, 16+k, 31-k); the high 256 bits produce them
;   for index k+1.
;
;   %1      : k (invoked with 0, 2, 4, 6)
;   %2, %3  : numbers of the registers holding the coefficient rows applied to
;             m3/m14 and to m2/m13 (caller loads them from tab_idct32_AVX512_2
;             and tab_idct32_AVX512_3)
;   %4, %5  : numbers of the registers holding the coefficient rows applied to
;             m8/m4 and to m7/m1 (caller loads them from tab_idct32_AVX512_1)
;   In      : m1-m4, m7, m8, m13, m14 - transposed input data set up by the caller
;             m15 - rounding constant, r3/r4 - scratch pointers (r4 = r3 + 15 * 64)
;             k1 = 0xAAAA, k2 = 0xCCCC, k3 = 0x2222, k4 = 0x8888 (dword masks)
;   Clobbers: m0, m5, m6, m9, m10, m11, m12
;
;   Instead of phaddd, horizontal sums use per-128-bit-lane byte shifts
;   (vpsrldq / vpslldq) plus paddd to steer each total into a chosen dword,
;   followed by masked moves that merge the totals into one register.
;-----------------------------------------------------------------------------
%macro IDCT32_AVX512_PASS1 5
    ; Total of m8*%4 + m7*%5 ends up in dword 0 of every 128-bit lane.
    pmaddwd         m9,  m8, m%4
    pmaddwd         m10, m7, m%5

    paddd            m9,  m10
    vpsrldq          m0,   m9, 8
    paddd            m9,   m0
    vpsrldq          m0,   m9, 4
    paddd            m9,   m0

    ; Total of m4*%4 + m1*%5 ends up in dword 1 of every lane.
    pmaddwd         m10, m4, m%4
    pmaddwd         m11, m1, m%5

    paddd           m10,   m11
    vpsrldq          m0,   m10, 8
    paddd           m10,   m0
    vpslldq          m0,   m10, 4
    paddd           m10,    m0

    ; k3 selects dword 1 of each lane
    vmovdqu32       m9 {k3}, m10

    ; Same two products against a second coefficient pair from tab_idct32_AVX512_5
    ; (presumably the rows for outputs 15-k / 15-(k+1) - table not visible here).
    mova            m6,  [tab_idct32_AVX512_5 + %1 * 64]
    mova            m5,  [tab_idct32_AVX512_5 + %1 * 64 + 64]

    ; Total ends up in dword 2 of every lane.
    pmaddwd         m10, m8, m6
    pmaddwd         m11, m7, m5

    paddd           m10,  m11
    vpslldq         m0,   m10, 8
    paddd           m10,   m0
    vpsrldq          m0,  m10, 4
    paddd           m10,   m0

    ; Total ends up in dword 3 of every lane.
    pmaddwd         m11, m4, m6
    pmaddwd         m12, m1, m5

    paddd           m11,   m12
    vpslldq          m0,   m11, 8
    paddd           m11,    m0
    vpslldq          m0,   m11, 4
    paddd           m11,    m0

    ; k4 selects dword 3, k2 selects dwords 2-3: m9 now carries all four odd-part totals per lane.
    vmovdqu32        m10  {k4},  m11
    vmovdqu32        m9  {k2}, m10

    ; Even part, first half: m3 and m14 against %2; totals land in dwords 0 and 1.
    pmaddwd         m10, m3, m%2
    pmaddwd         m11, m14, m%2

    vpsrldq          m0,   m10, 4
    paddd           m10,    m0
    vpslldq          m5,   m11, 4
    paddd           m11,    m5
    ; k1 selects the odd dwords
    vmovdqu32       m10   {k1}, m11

    vpsrldq         m0,    m10, 8
    paddd           m10,    m0

    ; Even part, second half: m2 and m13 against %3, same reduction.
    pmaddwd         m11, m2, m%3
    pmaddwd         m12, m13, m%3

    vpsrldq          m0,   m11, 4
    paddd           m11,    m0
    vpslldq          m5,   m12, 4
    paddd           m12,    m5
    vmovdqu32       m11   {k1}, m12

    vpsrldq          m0,   m11, 8
    paddd           m11,    m0

    ; Sum / difference of the two even halves, merged into one register.
    paddd           m12, m10, m11
    psubd           m10, m11

    punpcklqdq      m12, m10
    ; even + odd, rounded and shifted
    paddd           m10, m9, m12
    paddd           m10, m15
    psrad           m10, IDCT_SHIFT1

    ; even - odd, rounded and shifted
    psubd           m12, m9
    paddd           m12, m15
    psrad           m12, IDCT_SHIFT1

    ; Pack to int16 and split the zmm into its four 128-bit lanes:
    ; xm10 = lane 0, xm12 = lane 1, xm5 = lane 2, xm0 = lane 3.
    packssdw        m10, m12
    vextracti128    xm12, m10, 1
    vextracti64x4   ym5,  m10, 1
    vextracti128    xm0, ym5, 1

    ; Lanes 0/1 -> scratch rows for index k (same store pattern as IDCT32_PASS1).
    movd            [r3 + %1 * 64], xm10
    movd            [r3 + 32 + %1 * 64], xm12
    pextrd          [r4 - %1 * 64], xm10, 1
    pextrd          [r4+ 32 - %1 * 64], xm12, 1
    pextrd          [r3 + 16 * 64 + %1 *64], xm10, 3
    pextrd          [r3 + 16 * 64 + 32 + %1 * 64], xm12, 3
    pextrd          [r4 + 16 * 64 - %1 * 64], xm10, 2
    pextrd          [r4 + 16 * 64 + 32 - %1 * 64], xm12, 2

    ; Lanes 2/3 -> scratch rows for index k+1.
    movd            [r3 + (%1 + 1) * 64], xm5
    movd            [r3 + 32 + (%1 + 1) * 64], xm0
    pextrd          [r4 - (%1 + 1) * 64], xm5, 1
    pextrd          [r4+ 32 - (%1 + 1) * 64], xm0, 1
    pextrd          [r3 + 16 * 64 + (%1 + 1) * 64], xm5, 3
    pextrd          [r3 + 16 * 64 + 32 + (%1 + 1) * 64], xm0, 3
    pextrd          [r4 + 16 * 64 - (%1 + 1) * 64], xm5, 2
    pextrd          [r4 + 16 * 64 + 32 - (%1 + 1) * 64], xm0, 2
%endmacro

;-----------------------------------------------------------------------------
; IDCT32_AVX512_PASS2
;   Second-pass computation for two scratch rows at once (one row per 256-bit
;   half of each zmm register). Takes no macro arguments; everything is passed
;   in registers.
;
;   In  : m0  - first 32 bytes of both scratch rows
;         m1  - second 32 bytes of both scratch rows
;         m7-m14, m16-m23 - tab_idct32_AVX512_6 rows 0..15 (loaded by the caller)
;         m26-m31         - tab_idct32_AVX512_4 rows 0..5 (loaded by the caller);
;                           rows 6..15 are read from memory here
;         m15 - rounding constant, k1 = 0xAAAA, k2 = 0xCCCC (dword masks)
;   Out : m6 - first 16 int16 results of each row, m2 - last 16 of each row
;   Clobbers: m0, m1, m3, m4, m5, m24, m25
;
;   Reduction pattern, repeated throughout: a pair of products is folded with
;   a 4-byte shift + add and merged under k1, two such pairs are folded with
;   an 8-byte shift + add and merged under k2, and finally vpermi2q + paddd
;   combine partial sums held in different 128-bit lanes. vpermi2q overwrites
;   its index operand, which is why the shuffle tables are reloaded into
;   m24/m25 before every use.
;-----------------------------------------------------------------------------
%macro IDCT32_AVX512_PASS2 0
    ; ---- m2: m0 against table registers m7..m14 ----
    pmaddwd         m2, m0, m7
    pmaddwd         m3, m0, m8

    vpsrldq         m24,   m2, 4
    paddd            m2,  m24
    vpslldq         m25,   m3, 4
    paddd            m3,  m25
    vmovdqu32        m2   {k1}, m3

    pmaddwd         m3, m0, m9
    pmaddwd         m4, m0, m10

    vpsrldq         m24,   m3, 4
    paddd            m3,  m24
    vpslldq         m25,   m4, 4
    paddd            m4,  m25
    vmovdqu32        m3   {k1}, m4

    vpsrldq         m24,   m2, 8
    paddd            m2,  m24
    vpslldq         m25,   m3, 8
    paddd            m3,  m25
    vmovdqu32        m2   {k2}, m3

    pmaddwd         m3, m0, m11
    pmaddwd         m4, m0, m12

    vpsrldq         m24,   m3, 4
    paddd            m3,  m24
    vpslldq         m25,   m4, 4
    paddd            m4,  m25
    vmovdqu32        m3   {k1}, m4

    pmaddwd         m4, m0, m13
    pmaddwd         m5, m0, m14

    vpsrldq         m24,   m4, 4
    paddd            m4,  m24
    vpslldq         m25,   m5, 4
    paddd            m5,  m25
    vmovdqu32        m4   {k1}, m5

    vpsrldq         m24,   m3, 8
    paddd            m3,  m24
    vpslldq         m25,   m4, 8
    paddd            m4,  m25
    vmovdqu32        m3   {k2}, m4

    ; Cross-lane combine of the partial sums in m2/m3.
    mova           m24,        [idct16_AVX512_shuff3]
    mova           m25,        [idct16_AVX512_shuff2]
    vpermi2q       m24,        m2,       m3
    vpermi2q       m25,        m2,       m3
    paddd           m2, m25, m24

    ; ---- m3: m0 against table registers m16..m23 ----
    pmaddwd         m3, m0, m16
    pmaddwd         m4, m0, m17

    vpsrldq         m24,   m3, 4
    paddd            m3,  m24
    vpslldq         m25,   m4, 4
    paddd            m4,  m25
    vmovdqu32        m3   {k1}, m4

    pmaddwd         m4, m0, m18
    pmaddwd         m5, m0, m19

    vpsrldq         m24,   m4, 4
    paddd            m4,  m24
    vpslldq         m25,   m5, 4
    paddd            m5,  m25
    vmovdqu32        m4   {k1}, m5

    vpsrldq         m24,   m3, 8
    paddd            m3,  m24
    vpslldq         m25,   m4, 8
    paddd            m4,  m25
    vmovdqu32        m3   {k2}, m4

    pmaddwd         m4, m0, m20
    pmaddwd         m5, m0, m21

    vpsrldq         m24,   m4, 4
    paddd            m4,  m24
    vpslldq         m25,   m5, 4
    paddd            m5,  m25
    vmovdqu32        m4   {k1}, m5

    ; Last use of m0 as input; it is overwritten by its own product.
    pmaddwd         m5, m0, m22
    pmaddwd         m0,     m23

    vpsrldq         m24,   m5, 4
    paddd            m5,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m5   {k1}, m0

    vpsrldq         m24,   m4, 8
    paddd            m4,  m24
    vpslldq         m25,   m5, 8
    paddd            m5,  m25
    vmovdqu32        m4   {k2}, m5

    mova           m24,        [idct16_AVX512_shuff3]
    mova           m25,        [idct16_AVX512_shuff2]
    vpermi2q       m24,        m3,       m4
    vpermi2q       m25,        m3,       m4
    paddd           m3, m25, m24

    ; ---- m4: m1 against tab_idct32_AVX512_4 rows 0..7 (m26..m31 + two from memory) ----
    pmaddwd         m4, m1, m26
    pmaddwd         m0, m1, m27

    vpsrldq         m24,   m4, 4
    paddd            m4,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m4   {k1}, m0

    pmaddwd         m5, m1, m28
    pmaddwd         m0, m1, m29

    vpsrldq         m24,   m5, 4
    paddd            m5,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m5   {k1}, m0


    vpsrldq         m24,   m4, 8
    paddd            m4,  m24
    vpslldq         m25,   m5, 8
    paddd            m5,  m25
    vmovdqu32        m4   {k2}, m5

    pmaddwd         m5, m1, m30
    pmaddwd         m0, m1, m31

    vpsrldq         m24,   m5, 4
    paddd            m5,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m5   {k1}, m0

    pmaddwd         m6, m1, [tab_idct32_AVX512_4 + 6 * mmsize]
    pmaddwd         m0, m1, [tab_idct32_AVX512_4 + 7 * mmsize]

    vpsrldq         m24,   m6, 4
    paddd            m6,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m6   {k1}, m0

    vpsrldq         m24,   m5, 8
    paddd            m5,  m24
    vpslldq         m25,   m6, 8
    paddd            m6,  m25
    vmovdqu32        m5   {k2}, m6

    mova           m24,        [idct16_AVX512_shuff3]
    mova           m25,        [idct16_AVX512_shuff2]
    vpermi2q       m24,        m4,       m5
    vpermi2q       m25,        m4,       m5
    paddd           m4, m25, m24

    ; ---- m5: m1 against tab_idct32_AVX512_4 rows 8..15 (all from memory) ----
    pmaddwd         m5, m1, [tab_idct32_AVX512_4 + 8 * mmsize]
    pmaddwd         m0, m1, [tab_idct32_AVX512_4 + 9 * mmsize]

    vpsrldq         m24,   m5, 4
    paddd            m5,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m5   {k1}, m0

    pmaddwd         m6, m1, [tab_idct32_AVX512_4 + 10 * mmsize]
    pmaddwd         m0, m1, [tab_idct32_AVX512_4 + 11 * mmsize]

    vpsrldq         m24,   m6, 4
    paddd            m6,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m6   {k1}, m0

    vpsrldq         m24,   m5, 8
    paddd            m5,  m24
    vpslldq         m25,   m6, 8
    paddd            m6,  m25
    vmovdqu32        m5   {k2}, m6

    pmaddwd         m6, m1, [tab_idct32_AVX512_4 + 12 * mmsize]
    pmaddwd         m0, m1, [tab_idct32_AVX512_4 + 13 * mmsize]

    vpsrldq         m24,   m6, 4
    paddd            m6,  m24
    vpslldq         m25,   m0, 4
    paddd            m0,  m25
    vmovdqu32        m6   {k1}, m0

    ; Last use of m1 as input; it is overwritten by its own product.
    pmaddwd         m0, m1, [tab_idct32_AVX512_4 + 14 * mmsize]
    pmaddwd         m1,     [tab_idct32_AVX512_4 + 15 * mmsize]

    vpsrldq         m24,   m0, 4
    paddd            m0,  m24
    vpslldq         m25,   m1, 4
    paddd            m1,  m25
    vmovdqu32        m0   {k1}, m1

    vpsrldq         m24,   m6, 8
    paddd            m6,  m24
    vpslldq         m25,   m0, 8
    paddd            m0,  m25
    vmovdqu32        m6   {k2}, m0

    mova           m24,        [idct16_AVX512_shuff3]
    mova           m25,        [idct16_AVX512_shuff2]
    vpermi2q       m24,        m5,       m6
    vpermi2q       m25,        m5,       m6
    paddd           m5, m25, m24

    ; Butterfly: m6 / m4 = sums (m2 + m4, m3 + m5), m2 / m3 = differences,
    ; each rounded with m15 and shifted by IDCT_SHIFT2.
    paddd           m6, m2, m4
    paddd           m6, m15
    psrad           m6, IDCT_SHIFT2

    psubd           m2, m4
    paddd           m2, m15
    psrad           m2, IDCT_SHIFT2

    paddd           m4, m3, m5
    paddd           m4, m15
    psrad           m4, IDCT_SHIFT2

    psubd           m3, m5
    paddd           m3, m15
    psrad           m3, IDCT_SHIFT2

    packssdw        m6, m4
    packssdw        m2, m3

    ; vpermq with an immediate permutes qwords within each 256-bit half, i.e.
    ; per row. idct16_AVX512_shuff6 then reorders the bytes of the difference
    ; half (presumably a per-lane word reversal - table not visible here).
    vpermq          m6, m6, 0xD8
    vpermq          m2, m2, 0x8D
    pshufb          m2, [idct16_AVX512_shuff6]
%endmacro

;-------------------------------------------------------------------
; void idct32(const int16_t* src, int16_t* dst, intptr_t dstStride)
;
; AVX-512 32x32 inverse DCT, done in two passes through a 32 * 64-byte
; scratch buffer on the stack.
;   r0 = src        32 rows of 32 int16 coefficients (row pitch 64 bytes)
;   r1 = dst        written with unaligned 32-byte stores (movu)
;   r2 = dstStride  in int16 units; doubled to bytes before pass 2
; Pass 1 runs 8 iterations of 4 columns each, pass 2 runs 16 iterations of
; two scratch rows each.
;-------------------------------------------------------------------

INIT_ZMM avx512
cglobal idct32, 3, 8, 32, 0-32*64

%define             IDCT_SHIFT1         7

    ; m15 = pass-1 rounding constant (pd_64, presumably 1 << (IDCT_SHIFT1 - 1))
    vbroadcasti128  m15, [pd_64]

    ; r3 -> scratch row 0, r4 -> scratch row 15, r5d = number of 4-column groups
    mov             r3,  rsp
    lea             r4,  [r3 + 15 * 64]
    mov             r5d, 8
    ; Dword merge masks used by the pass macros (bit pattern repeats per 128-bit lane):
    ; k1 = dwords 1,3   k2 = dwords 2,3   k3 = dword 1   k4 = dword 3
    mov             r7d, 0xAAAA
    kmovd            k1, r7d
    mov             r7d, 0xCCCC
    kmovd            k2, r7d
    mov             r7d, 0x2222
    kmovd            k3, r7d
    mov             r7d, 0x8888
    kmovd            k4, r7d


    ; Pass-1 coefficient tables, kept in m16-m31 for the whole pass-1 loop.
    mova            m16, [tab_idct32_AVX512_2 + 0 * 64]
    mova            m17, [tab_idct32_AVX512_2 + 1 * 64]
    mova            m18, [tab_idct32_AVX512_2 + 2 * 64]
    mova            m19, [tab_idct32_AVX512_2 + 3 * 64]

    mova            m20, [tab_idct32_AVX512_3 + 0 * 64]
    mova            m21, [tab_idct32_AVX512_3 + 1 * 64]
    mova            m22, [tab_idct32_AVX512_3 + 2 * 64]
    mova            m23, [tab_idct32_AVX512_3 + 3 * 64]

    mova            m24, [tab_idct32_AVX512_1 + 0 * 64]
    mova            m25, [tab_idct32_AVX512_1 + 1 * 64]
    mova            m26, [tab_idct32_AVX512_1 + 2 * 64]
    mova            m27, [tab_idct32_AVX512_1 + 3 * 64]
    mova            m28, [tab_idct32_AVX512_1 + 4 * 64]
    mova            m29, [tab_idct32_AVX512_1 + 5 * 64]
    mova            m30, [tab_idct32_AVX512_1 + 6 * 64]
    mova            m31, [tab_idct32_AVX512_1 + 7 * 64]

.pass1:
    ; Gather 4 coefficients (8 bytes) from each of the 32 input rows, four rows
    ; per ymm register. The bracketed comments list the source rows of each
    ; 64-bit quarter, lowest quarter first.
    movq            xm0,    [r0 +  2 * 64]
    movq            xm1,    [r0 + 18 * 64]
    punpcklqdq      xm0,    xm0,  xm1
    movq            xm1,    [r0 +  0 * 64]
    movq            xm2,    [r0 + 16 * 64]
    punpcklqdq      xm1,    xm1,  xm2
    vinserti128     ym0,    ym0,  xm1, 1             ;[2 18 0 16]

    movq            xm1,    [r0 + 1 * 64]
    movq            xm2,    [r0 + 9 * 64]
    punpcklqdq      xm1,    xm1,  xm2
    movq            xm2,    [r0 + 17 * 64]
    movq            xm3,    [r0 + 25 * 64]
    punpcklqdq      xm2,    xm2,  xm3
    vinserti128     ym1,    ym1,  xm2, 1             ;[1 9 17 25]

    movq            xm2,    [r0 + 6 * 64]
    movq            xm3,    [r0 + 22 * 64]
    punpcklqdq      xm2,    xm2,  xm3
    movq            xm3,    [r0 + 4 * 64]
    movq            xm4,    [r0 + 20 * 64]
    punpcklqdq      xm3,    xm3,  xm4
    vinserti128     ym2,    ym2,  xm3, 1             ;[6 22 4 20]

    movq            xm3,    [r0 + 3 * 64]
    movq            xm4,    [r0 + 11 * 64]
    punpcklqdq      xm3,    xm3,  xm4
    movq            xm4,    [r0 + 19 * 64]
    movq            xm5,    [r0 + 27 * 64]
    punpcklqdq      xm4,    xm4,  xm5
    vinserti128     ym3,    ym3,  xm4, 1             ;[3 11 19 27]

    movq            xm4,    [r0 + 10 * 64]
    movq            xm5,    [r0 + 26 * 64]
    punpcklqdq      xm4,    xm4,  xm5
    movq            xm5,    [r0 + 8 * 64]
    movq            xm6,    [r0 + 24 * 64]
    punpcklqdq      xm5,    xm5,  xm6
    vinserti128     ym4,    ym4,  xm5, 1             ;[10 26 8 24]

    movq            xm5,    [r0 + 5 * 64]
    movq            xm6,    [r0 + 13 * 64]
    punpcklqdq      xm5,    xm5,  xm6
    movq            xm6,    [r0 + 21 * 64]
    movq            xm7,    [r0 + 29 * 64]
    punpcklqdq      xm6,    xm6,  xm7
    vinserti128     ym5,    ym5,  xm6, 1             ;[5 13 21 29]

    movq            xm6,    [r0 + 14 * 64]
    movq            xm7,    [r0 + 30 * 64]
    punpcklqdq      xm6,    xm6,  xm7
    movq            xm7,    [r0 + 12 * 64]
    movq            xm8,    [r0 + 28 * 64]
    punpcklqdq      xm7,    xm7,  xm8
    vinserti128     ym6,    ym6,  xm7, 1             ;[14 30 12 28]

    movq            xm7,    [r0 + 7 * 64]
    movq            xm8,    [r0 + 15 * 64]
    punpcklqdq      xm7,    xm7,  xm8
    movq            xm8,    [r0 + 23 * 64]
    movq            xm9,    [r0 + 31 * 64]
    punpcklqdq      xm8,    xm8,  xm9
    vinserti128     ym7,    ym7,  xm8, 1             ;[7 15 23 31]

    ; Transpose network on the 256-bit halves: word, dword, then qword
    ; interleaves. In the element comments from the dword stage on, the last
    ; digit is the column (0..3) and the leading digits are the row, e.g. "102"
    ; is row 10, column 2.
    punpckhwd       ym8, ym0, ym2                  ;[18 22 16 20]
    punpcklwd       ym0, ym2                       ;[2 6 0 4]

    punpckhwd       ym2, ym1, ym3                  ;[9 11 25 27]
    punpcklwd       ym1, ym3                       ;[1 3 17 19]

    punpckhwd       ym3, ym4, ym6                  ;[26 30 24 28]
    punpcklwd       ym4, ym6                       ;[10 14 8 12]

    punpckhwd       ym6, ym5, ym7                  ;[13 15 29 31]
    punpcklwd       ym5, ym7                       ;[5 7 21 23]

    punpckhdq       ym7, ym0, ym4                  ;[22 62 102 142 23 63 103 143 02 42 82 122 03 43 83 123]
    punpckldq       ym0, ym4                       ;[20 60 100 140 21 61 101 141 00 40 80 120 01 41 81 121]

    punpckhdq       ym4, ym8, ym3                  ;[182 222 262 302 183 223 263 303 162 202 242 282 163 203 243 283]
    punpckldq       ym8, ym3                       ;[180 220 260 300 181 221 261 301 160 200 240 280 161 201 241 281]

    punpckhdq       ym3, ym1, ym5                  ;[12 32 52 72 13 33 53 73 172 192 212 232 173 193 213 233]
    punpckldq       ym1, ym5                       ;[10 30 50 70 11 31 51 71 170 190 210 230 171 191 211 231]

    punpckhdq       ym5, ym2, ym6                  ;[92 112 132 152 93 113 133 153 252 272 292 312 253 273 293 313]
    punpckldq       ym2, ym6                       ;[90 110 130 150 91 111 131 151 250 270 290 310 251 271 291 311]

    punpckhqdq      ym6, ym0, ym8                  ;[21 61 101 141 181 221 261 301 01 41 81 121 161 201 241 281]
    punpcklqdq      ym0, ym8                       ;[20 60 100 140 180 220 260 300 00 40 80 120 160 200 240 280]

    punpckhqdq      ym8, ym7, ym4                  ;[23 63 103 143 183 223 263 303 03 43 83 123 163 203 243 283]
    punpcklqdq      ym7, ym4                       ;[22 62 102 142 182 222 262 302 02 42 82 122 162 202 242 282]

    punpckhqdq      ym4, ym1, ym2                  ;[11 31 51 71 91 111 131 151 171 191 211 231 251 271 291 311]
    punpcklqdq      ym1, ym2                       ;[10 30 50 70 90 110 130 150 170 190 210 230 250 270 290 310]

    punpckhqdq      ym2, ym3, ym5                  ;[13 33 53 73 93 113 133 153 173 193 213 233 253 273 293 313]
    punpcklqdq      ym3, ym5                       ;[12 32 52 72 92 112 132 152 172 192 212 232 252 272 292 312]

    ; Widen to 512 bits: copy each register's low 256 bits into its high half,
    ; then regroup qwords from each register pair with vpermi2q using the
    ; idct16_AVX512_shuff2 / shuff3 index tables (contents not visible here).
    ; vpermi2q writes its result over the index register, so the outputs are
    ; the registers loaded with movu. Each pair's sources are dead afterwards
    ; and are reused as destinations of the next pair.
    vinserti64x4    m7,        m7,      ym7, 1
    vinserti64x4    m8,        m8,      ym8, 1
    movu           m13,        [idct16_AVX512_shuff2]
    movu           m14,        [idct16_AVX512_shuff3]
    vpermi2q       m13,        m7,       m8
    vpermi2q       m14,        m7,       m8

    vinserti64x4    m1,        m1,      ym1, 1
    vinserti64x4    m4,        m4,      ym4, 1
    movu            m7,        [idct16_AVX512_shuff3]
    movu            m8,        [idct16_AVX512_shuff2]
    vpermi2q        m7,        m1,       m4
    vpermi2q        m8,        m1,       m4

    vinserti64x4    m3,        m3,      ym3, 1
    vinserti64x4    m2,        m2,      ym2, 1
    movu            m1,        [idct16_AVX512_shuff3]
    movu            m4,        [idct16_AVX512_shuff2]
    vpermi2q        m1,        m3,       m2
    vpermi2q        m4,        m3,       m2

    vinserti64x4    m0,        m0,      ym0, 1
    vinserti64x4    m6,        m6,      ym6, 1
    movu            m2,        [idct16_AVX512_shuff2]
    movu            m3,        [idct16_AVX512_shuff3]
    vpermi2q        m2,        m0,       m6
    vpermi2q        m3,        m0,       m6


    ; Each invocation handles indices k and k+1 (scratch rows k, 15-k, 16+k,
    ; 31-k and the same for k+1), so k = 0, 2, 4, 6 covers all 32 rows.
    IDCT32_AVX512_PASS1 0, 16, 20, 24, 25
    IDCT32_AVX512_PASS1 2, 17, 21, 26, 27
    IDCT32_AVX512_PASS1 4, 18, 22, 28, 29
    IDCT32_AVX512_PASS1 6, 19, 23, 30, 31

    ; Next 4 input columns (8 bytes); each scratch row advances by one dword
    ; in both of its 32-byte halves.
    add             r0, 8
    add             r3, 4
    add             r4, 4
    dec             r5d
    jnz             .pass1

    ; Pass-2 shift and rounding constant depend on the build-time bit depth
    ; (pd_N presumably holds N = 1 << (IDCT_SHIFT2 - 1)).
%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m15,                [pd_128]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m15,                [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m15,                [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif

    ; r3 -> scratch row 0, r2 = stride in bytes, r4d = row pairs left.
    ; k3 is redefined as a word mask selecting the upper 256 bits of a zmm.
    mov             r3,  rsp
    add             r2d, r2d
    mov             r4d, 16
    mov             r6d, 0xFFFF0000
    kmovd            k3, r6d

    ; Pass-2 coefficient tables: all 16 rows of tab_idct32_AVX512_6 and the
    ; first 6 rows of tab_idct32_AVX512_4 stay in registers (m24/m25 are left
    ; free as scratch for the macro).
    mova            m7,  [tab_idct32_AVX512_6]
    mova            m8,  [tab_idct32_AVX512_6 + 1 * mmsize]
    mova            m9,  [tab_idct32_AVX512_6 + 2 * mmsize]
    mova            m10, [tab_idct32_AVX512_6 + 3 * mmsize]
    mova            m11, [tab_idct32_AVX512_6 + 4 * mmsize]
    mova            m12, [tab_idct32_AVX512_6 + 5 * mmsize]
    mova            m13, [tab_idct32_AVX512_6 + 6 * mmsize]
    mova            m14, [tab_idct32_AVX512_6 + 7 * mmsize]
    mova            m16, [tab_idct32_AVX512_6 + 8 * mmsize]
    mova            m17, [tab_idct32_AVX512_6 + 9 * mmsize]
    mova            m18, [tab_idct32_AVX512_6 + 10 * mmsize]
    mova            m19, [tab_idct32_AVX512_6 + 11 * mmsize]
    mova            m20, [tab_idct32_AVX512_6 + 12 * mmsize]
    mova            m21, [tab_idct32_AVX512_6 + 13 * mmsize]
    mova            m22, [tab_idct32_AVX512_6 + 14 * mmsize]
    mova            m23, [tab_idct32_AVX512_6 + 15 * mmsize]
    mova            m26, [tab_idct32_AVX512_4]
    mova            m27, [tab_idct32_AVX512_4 + 1 * mmsize]
    mova            m28, [tab_idct32_AVX512_4 + 2 * mmsize]
    mova            m29, [tab_idct32_AVX512_4 + 3 * mmsize]
    mova            m30, [tab_idct32_AVX512_4 + 4 * mmsize]
    mova            m31, [tab_idct32_AVX512_4 + 5 * mmsize]

.pass2:
    ; Two scratch rows per iteration. The ymm loads fill the low halves from
    ; the first row; the masked loads fill the high halves, where element 16
    ; sits 32 bytes past the base address:
    ;   m0 = [r3 +  0 .. 31] | [r3 + 64 ..  95]   (first half of both rows)
    ;   m1 = [r3 + 32 .. 63] | [r3 + 96 .. 127]   (second half of both rows)
    movu            ym0, [r3]
    movu            ym1, [r3 + 32]
    vmovdqu16        m0  {k3}, [r3 + 32]
    vmovdqu16        m1  {k3}, [r3 + 64]

    IDCT32_AVX512_PASS2
    ; Low 256 bits of m6 / m2 form the first output row ...
    movu            [r1],      ym6
    movu            [r1 + 32], ym2
    ; ... and the high 256 bits form the following row.
    vextracti64x4   ym24,       m6, 1
    vextracti64x4   ym25,       m2, 1
    add             r1,         r2
    movu            [r1 ],     ym24
    movu            [r1 + 32], ym25

    add             r1, r2
    add             r3, 128
    dec             r4d
    jnz             .pass2
    RET

;-------------------------------------------------------
; void idct4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;
; AVX2 4x4 inverse transform. The whole 4x4 block of 16-bit coefficients is
; loaded with a single 32-byte read and processed in two multiply-accumulate
; stages:
;   stage 1: pmaddwd against avx2_idct4_1, round with pd_64, >> IDCT_SHIFT1 (7)
;   stage 2: pmaddwd against avx2_idct4_2, round with m5,    >> IDCT_SHIFT2
;            (IDCT_SHIFT2 = 12 / 10 / 8 for BIT_DEPTH 8 / 10 / 12)
; Both stages saturate back to 16 bits with packssdw.
;
; Registers: r0 = src, r1 = dst, r2 = dstStride (doubled on entry; presumably
;            given in int16_t elements and converted to bytes - TODO confirm),
;            r3 = 3 * r2.
;-------------------------------------------------------
INIT_YMM avx2
cglobal idct4, 3, 4, 6

%define             IDCT_SHIFT1         7
; m5 = stage-2 rounding offset; the pd_N constant chosen here is presumably
; 1 << (IDCT_SHIFT2 - 1) - verify against the constant definitions.
%if BIT_DEPTH == 12
    %define         IDCT_SHIFT2        8
    vpbroadcastd    m5,                [pd_128]
%elif BIT_DEPTH == 10
    %define         IDCT_SHIFT2        10
    vpbroadcastd    m5,                [pd_512]
%elif BIT_DEPTH == 8
    %define         IDCT_SHIFT2        12
    vpbroadcastd    m5,                [pd_2048]
%else
    %error Unsupported BIT_DEPTH!
%endif
    ; m4 = stage-1 rounding offset (added before the shift by IDCT_SHIFT1)
    vbroadcasti128  m4, [pd_64]

    ; r2 = 2 * dstStride, r3 = 3 * r2 (used to address output rows 1..3)
    add             r2d, r2d
    lea             r3, [r2 * 3]

    movu            m0, [r0]                      ;[00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33]

    ; Rearrange the coefficients into word pairs suitable for pmaddwd, and
    ; duplicate each 128-bit half into both lanes.
    pshufb          m0, [idct4_shuf1]             ;[00 02 01 03 10 12 11 13 20 22 21 23 30 32 31 33]
    vextracti128    xm1, m0, 1                    ;[20 22 21 23 30 32 31 33]
    punpcklwd       xm2, xm0, xm1                 ;[00 20 02 22 01 21 03 23]
    punpckhwd       xm0, xm1                      ;[10 30 12 32 11 31 13 33]
    vinserti128     m2, m2, xm2, 1                ;[00 20 02 22 01 21 03 23 00 20 02 22 01 21 03 23]
    vinserti128     m0, m0, xm0, 1                ;[10 30 12 32 11 31 13 33 10 30 12 32 11 31 13 33]

    ; Stage 1: m1 = products from rows 0/2, m3 = products from rows 1/3.
    mova            m1, [avx2_idct4_1]
    mova            m3, [avx2_idct4_1 + 32]
    pmaddwd         m1, m2
    pmaddwd         m3, m0

    ; Sum and difference of the two partial products, each rounded and shifted.
    paddd           m0, m1, m3
    paddd           m0, m4
    psrad           m0, IDCT_SHIFT1               ;[00 20 10 30 01 21 11 31]

    psubd           m1, m3
    paddd           m1, m4
    psrad           m1, IDCT_SHIFT1               ;[03 23 13 33 02 22 12 32]

    ; Saturate to 16 bits, then split into even/odd dword pairs for stage 2.
    packssdw        m0, m1                        ;[00 20 10 30 03 23 13 33 01 21 11 31 02 22 12 32]
    vmovshdup       m1, m0                        ;[10 30 10 30 13 33 13 33 11 31 11 31 12 32 12 32]
    vmovsldup       m0, m0                        ;[00 20 00 20 03 23 03 23 01 21 01 21 02 22 02 22]

    ; Stage 2: same sum/difference structure with the bit-depth dependent
    ; rounding offset (m5) and shift (IDCT_SHIFT2).
    vpbroadcastq    m2, [avx2_idct4_2]
    vpbroadcastq    m3, [avx2_idct4_2 + 8]
    pmaddwd         m0, m2
    pmaddwd         m1, m3

    paddd           m2, m0, m1
    paddd           m2, m5
    psrad           m2, IDCT_SHIFT2               ;[00 01 10 11 30 31 20 21]

    psubd           m0, m1
    paddd           m0, m5
    psrad           m0, IDCT_SHIFT2               ;[03 02 13 12 33 32 23 22]

    ; Put the results back into row order and saturate to 16 bits.
    pshufb          m0, [idct4_shuf2]             ;[02 03 12 13 32 33 22 23]
    punpcklqdq      m1, m2, m0                    ;[00 01 02 03 10 11 12 13]
    punpckhqdq      m2, m0                        ;[30 31 32 33 20 21 22 23]
    packssdw        m1, m2                        ;[00 01 02 03 30 31 32 33 10 11 12 13 20 21 22 23]
    vextracti128    xm0, m1, 1

    ; Store four 8-byte output rows. Per the lane layout noted above:
    ; xm1 = [row 0 | row 3], xm0 = [row 1 | row 2].
    movq            [r1], xm1
    movq            [r1 + r2], xm0
    movhps          [r1 + 2 * r2], xm0
    movhps          [r1 + r3], xm1
    RET

;static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
;{
;    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
;    const int scaleBits = SCALE_BITS - 2 * transformShift;
;    const uint32_t trSize = 1 << log2TrSize;

;    for (int y = 0; y < MLS_CG_SIZE; y++)
;    {
;        for (int x = 0; x < MLS_CG_SIZE; x++)
;        {
;             int signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
;             costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
;             *totalUncodedCost += costUncoded[blkPos + x];
;             *totalRdCost += costUncoded[blkPos + x];
;        }
;        blkPos += trSize;
;    }
;}

;---------------------------------------------------------------------------------------------------------------------------------------------------------
; void nonPsyRdoQuant4(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
;
; AVX-512 kernel for the 16 coefficients of a 4x4 block that starts at blkPos:
;     costUncoded[i] = (m_resiDctCoeff[i] * m_resiDctCoeff[i]) << shift
; where the shift count is read from the tab_nonpsyRdo* table matching BIT_DEPTH.
; The sum of the 16 costs is added to *totalUncodedCost and to *totalRdCost.
;
; Registers: r0 = m_resiDctCoeff, r1 = costUncoded, r2 = totalUncodedCost, r3 = totalRdCost, r4 = blkPos
;---------------------------------------------------------------------------------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal nonPsyRdoQuant4, 5, 5, 8
    mov             r4d, r4m                        ; blkPos, zero-extended to 64 bits
    lea             r0, [r0 + r4 * 2]               ; r0 = &m_resiDctCoeff[blkPos] (2-byte elements)
    lea             r1, [r1 + r4 * 8]               ; r1 = &costUncoded[blkPos]    (8-byte elements)

    ; xm3 = shift count for the squared coefficients
%if BIT_DEPTH == 12
    movq            xm3, [tab_nonpsyRdo12]
%elif BIT_DEPTH == 10
    movq            xm3, [tab_nonpsyRdo10]
%elif BIT_DEPTH == 8
    movq            xm3, [tab_nonpsyRdo8]
%else
    %error Unsupported BIT_DEPTH!
%endif

    movq            xm6, [r2]                       ; running *totalUncodedCost
    movq            xm7, [r3]                       ; running *totalRdCost
    vpxor           m4, m4                          ; m4 = per-lane cost accumulator
    vpxor           m5, m5                          ; m5 = 0 (FMA addend / zero source for the final fold)

    ; Two passes of 8 coefficients each (rows 1-2, then rows 3-4). The rows of a
    ; 4x4 block are contiguous, so each pass is one 16-byte load and one 64-byte store.
%assign NPQ4_OFS 0
%rep 2
    movu            xm0, [r0 + NPQ4_OFS]
    vpmovsxwq       m1, xm0                         ; 8 x int16 -> 8 x int64
    vcvtqq2pd       m2, m1                          ; -> double
    vfmadd213pd     m2, m2, m5                      ; m2 = m2 * m2 + 0
    vcvtpd2qq       m1, m2                          ; -> int64
    vpsllq          m1, xm3                         ; costUncoded
    paddq           m4, m1
    movu            [r1 + 4 * NPQ4_OFS], m1
%assign NPQ4_OFS NPQ4_OFS + 16
%endrep
%undef NPQ4_OFS

    ; Horizontal sum of the 8 qword lanes of m4 into the low qword of xm4.
    vextracti32x8   ym2, m4, 1
    paddq           ym4, ym2
    vextracti32x4   xm2, m4, 1
    paddq           xm4, xm2
    punpckhqdq      xm2, xm4, xm5
    paddq           xm4, xm2

    paddq           xm6, xm4
    paddq           xm7, xm4
    movq            [r2], xm6
    movq            [r3], xm7
    RET
;---------------------------------------------------------------------------------------------------------------------------------------------------------
; void nonPsyRdoQuant8(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
;
; AVX-512 kernel for one 4x4 coefficient group at blkPos inside a block whose rows are 8 coefficients apart
; (16 bytes in m_resiDctCoeff, 64 bytes in costUncoded):
;     costUncoded[i] = (m_resiDctCoeff[i] * m_resiDctCoeff[i]) << shift
; The sum of the 16 costs is added to *totalUncodedCost and to *totalRdCost.
;---------------------------------------------------------------------------------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal nonPsyRdoQuant8, 5, 5, 8
    mov             r4d, r4m                        ; blkPos, zero-extended to 64 bits
    lea             r0, [r0 + r4 * 2]               ; r0 = &m_resiDctCoeff[blkPos]
    lea             r1, [r1 + r4 * 8]               ; r1 = &costUncoded[blkPos]

    ; xm3 = shift count (second entry of the BIT_DEPTH-specific table)
%if BIT_DEPTH == 12
    movq            xm3, [tab_nonpsyRdo12 + 8]
%elif BIT_DEPTH == 10
    movq            xm3, [tab_nonpsyRdo10 + 8]
%elif BIT_DEPTH == 8
    movq            xm3, [tab_nonpsyRdo8 + 8]
%else
    %error Unsupported BIT_DEPTH!
%endif

    movq            xm6, [r2]                       ; running *totalUncodedCost
    movq            xm7, [r3]                       ; running *totalRdCost
    vpxor           m4, m4                          ; m4 = per-lane cost accumulator
    vpxor           m5, m5                          ; m5 = 0

    ; Each pass gathers 4 coefficients from two consecutive rows, squares them in
    ; double precision, shifts, and scatters the two 4-qword halves back per row.
%assign NPQ8_ROW 0
%rep 2
    movq            xm0, [r0 + NPQ8_ROW * 16]
    pinsrq          xm0, [r0 + (NPQ8_ROW + 1) * 16], 1
    vpmovsxwq       m1, xm0                         ; 8 x int16 -> 8 x int64
    vcvtqq2pd       m2, m1                          ; -> double
    vfmadd213pd     m2, m2, m5                      ; m2 = m2 * m2 + 0
    vcvtpd2qq       m1, m2                          ; -> int64
    vpsllq          m1, xm3                         ; costUncoded
    paddq           m4, m1
    movu            [r1 + NPQ8_ROW * 64], ym1
    vextracti32x8   [r1 + (NPQ8_ROW + 1) * 64], m1, 1
%assign NPQ8_ROW NPQ8_ROW + 2
%endrep
%undef NPQ8_ROW

    ; Horizontal sum of the 8 qword lanes of m4 into the low qword of xm4.
    vextracti32x8   ym2, m4, 1
    paddq           ym4, ym2
    vextracti32x4   xm2, m4, 1
    paddq           xm4, xm2
    punpckhqdq      xm2, xm4, xm5
    paddq           xm4, xm2

    paddq           xm6, xm4
    paddq           xm7, xm4
    movq            [r2], xm6
    movq            [r3], xm7
    RET
;---------------------------------------------------------------------------------------------------------------------------------------------------------
; void nonPsyRdoQuant16(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
;
; AVX-512 kernel for one 4x4 coefficient group at blkPos inside a block whose rows are 16 coefficients apart
; (32 bytes in m_resiDctCoeff, 128 bytes in costUncoded):
;     costUncoded[i] = (m_resiDctCoeff[i] * m_resiDctCoeff[i]) << shift
; The sum of the 16 costs is added to *totalUncodedCost and to *totalRdCost.
;---------------------------------------------------------------------------------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal nonPsyRdoQuant16, 5, 5, 8
    mov             r4d, r4m                        ; blkPos, zero-extended to 64 bits
    lea             r0, [r0 + r4 * 2]               ; r0 = &m_resiDctCoeff[blkPos]
    lea             r1, [r1 + r4 * 8]               ; r1 = &costUncoded[blkPos]

    ; xm3 = shift count (third entry of the BIT_DEPTH-specific table)
%if BIT_DEPTH == 12
    movq            xm3, [tab_nonpsyRdo12 + 16]
%elif BIT_DEPTH == 10
    movq            xm3, [tab_nonpsyRdo10 + 16]
%elif BIT_DEPTH == 8
    movq            xm3, [tab_nonpsyRdo8 + 16]
%else
    %error Unsupported BIT_DEPTH!
%endif

    movq            xm6, [r2]                       ; running *totalUncodedCost
    movq            xm7, [r3]                       ; running *totalRdCost
    vpxor           m4, m4                          ; m4 = per-lane cost accumulator
    vpxor           m5, m5                          ; m5 = 0

    ; Each pass gathers 4 coefficients from two consecutive rows, squares them in
    ; double precision, shifts, and scatters the two 4-qword halves back per row.
%assign NPQ16_ROW 0
%rep 2
    movq            xm0, [r0 + NPQ16_ROW * 32]
    pinsrq          xm0, [r0 + (NPQ16_ROW + 1) * 32], 1
    vpmovsxwq       m1, xm0                         ; 8 x int16 -> 8 x int64
    vcvtqq2pd       m2, m1                          ; -> double
    vfmadd213pd     m2, m2, m5                      ; m2 = m2 * m2 + 0
    vcvtpd2qq       m1, m2                          ; -> int64
    vpsllq          m1, xm3                         ; costUncoded
    paddq           m4, m1
    movu            [r1 + NPQ16_ROW * 128], ym1
    vextracti32x8   [r1 + (NPQ16_ROW + 1) * 128], m1, 1
%assign NPQ16_ROW NPQ16_ROW + 2
%endrep
%undef NPQ16_ROW

    ; Horizontal sum of the 8 qword lanes of m4 into the low qword of xm4.
    vextracti32x8   ym2, m4, 1
    paddq           ym4, ym2
    vextracti32x4   xm2, m4, 1
    paddq           xm4, xm2
    punpckhqdq      xm2, xm4, xm5
    paddq           xm4, xm2

    paddq           xm6, xm4
    paddq           xm7, xm4
    movq            [r2], xm6
    movq            [r3], xm7
    RET
;---------------------------------------------------------------------------------------------------------------------------------------------------------
; void nonPsyRdoQuant32(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
;
; AVX-512 kernel for one 4x4 coefficient group at blkPos inside a block whose rows are 32 coefficients apart
; (64 bytes in m_resiDctCoeff, 256 bytes in costUncoded):
;     costUncoded[i] = (m_resiDctCoeff[i] * m_resiDctCoeff[i]) << shift
; The sum of the 16 costs is added to *totalUncodedCost and to *totalRdCost.
;---------------------------------------------------------------------------------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal nonPsyRdoQuant32, 5, 5, 8
    mov             r4d, r4m                        ; blkPos, zero-extended to 64 bits
    lea             r0, [r0 + r4 * 2]               ; r0 = &m_resiDctCoeff[blkPos]
    lea             r1, [r1 + r4 * 8]               ; r1 = &costUncoded[blkPos]

    ; xm3 = shift count (fourth entry of the BIT_DEPTH-specific table)
%if BIT_DEPTH == 12
    movq            xm3, [tab_nonpsyRdo12 + 24]
%elif BIT_DEPTH == 10
    movq            xm3, [tab_nonpsyRdo10 + 24]
%elif BIT_DEPTH == 8
    movq            xm3, [tab_nonpsyRdo8 + 24]
%else
    %error Unsupported BIT_DEPTH!
%endif

    movq            xm6, [r2]                       ; running *totalUncodedCost
    movq            xm7, [r3]                       ; running *totalRdCost
    vpxor           m4, m4                          ; m4 = per-lane cost accumulator
    vpxor           m5, m5                          ; m5 = 0

    ; Each pass gathers 4 coefficients from two consecutive rows, squares them in
    ; double precision, shifts, and scatters the two 4-qword halves back per row.
%assign NPQ32_ROW 0
%rep 2
    movq            xm0, [r0 + NPQ32_ROW * 64]
    pinsrq          xm0, [r0 + (NPQ32_ROW + 1) * 64], 1
    vpmovsxwq       m1, xm0                         ; 8 x int16 -> 8 x int64
    vcvtqq2pd       m2, m1                          ; -> double
    vfmadd213pd     m2, m2, m5                      ; m2 = m2 * m2 + 0
    vcvtpd2qq       m1, m2                          ; -> int64
    vpsllq          m1, xm3                         ; costUncoded
    paddq           m4, m1
    movu            [r1 + NPQ32_ROW * 256], ym1
    vextracti32x8   [r1 + (NPQ32_ROW + 1) * 256], m1, 1
%assign NPQ32_ROW NPQ32_ROW + 2
%endrep
%undef NPQ32_ROW

    ; Horizontal sum of the 8 qword lanes of m4 into the low qword of xm4.
    vextracti32x8   ym2, m4, 1
    paddq           ym4, ym2
    vextracti32x4   xm2, m4, 1
    paddq           xm4, xm2
    punpckhqdq      xm2, xm4, xm5
    paddq           xm4, xm2

    paddq           xm6, xm4
    paddq           xm7, xm4
    movq            [r2], xm6
    movq            [r3], xm7
    RET
;static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t psyScale, uint32_t blkPos)
;{
;    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
;    const int scaleBits = SCALE_BITS - 2 * transformShift;
;    const uint32_t trSize = 1 << log2TrSize;
;    int max = X265_MAX(0, (2 * transformShift + 1));
;
;    for (int y = 0; y < MLS_CG_SIZE; y++)
;    {
;        for (int x = 0; x < MLS_CG_SIZE; x++)
;        {
;            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
;            int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
;
;            costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
;
;            /* when no residual coefficient is coded, predicted coef == recon coef */
;            costUncoded[blkPos + x] -= static_cast<int64_t>((psyScale * (predictedCoef)) >> max);
;
;            *totalUncodedCost += costUncoded[blkPos + x];
;            *totalRdCost += costUncoded[blkPos + x];
;        }
;        blkPos += trSize;
;    }
;}

;---------------------------------------------------------------------------------------------------------------------------------------------------------
; void psyRdoQuant4(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
;
; AVX-512 kernel for the 16 coefficients of a 4x4 block that starts at blkPos:
;     predictedCoef  = m_fencDctCoeff[i] - m_resiDctCoeff[i]
;     costUncoded[i] = ((m_resiDctCoeff[i] * m_resiDctCoeff[i]) << scaleBits) - ((*psyScale * predictedCoef) >> RDO_MAX_4)
; The sum of the 16 costs is added to *totalUncodedCost and to *totalRdCost.
; Both products are evaluated in double precision (vcvtqq2pd / vfmadd213pd / vcvtpd2qq).
;
; Registers: r0 = m_resiDctCoeff, r1 = m_fencDctCoeff, r2 = costUncoded, r3 = totalUncodedCost, r4 = totalRdCost,
;            r5 = psyScale pointer (reused afterwards for scaleBits), r6 = blkPos.
; Only r0-r6 are used, so 7 GPRs are requested.
;---------------------------------------------------------------------------------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal psyRdoQuant4, 5, 7, 13
%if WIN64
    mov             r5,        r5m                             ; cglobal loads only the first 5 arguments
%endif
    mov            r6d,        r6m                             ; blkPos, zero-extended to 64 bits
    vpbroadcastq   m12,        [r5]                            ; psyScale
    lea             r0,        [r0 + 2 * r6]
    lea             r1,        [r1 + 2 * r6]
    lea             r6,        [4 * r6]
    lea             r2,        [r2 + 2 * r6]                   ; r2 = &costUncoded[blkPos]
    movq           xm0,        [r3]                            ; running *totalUncodedCost
    movq           xm1,        [r4]                            ; running *totalRdCost

%if BIT_DEPTH == 12
    mov            r5,         [tab_nonpsyRdo12]               ; scaleBits
%elif BIT_DEPTH == 10
    mov            r5,         [tab_nonpsyRdo10]
%elif BIT_DEPTH == 8
    mov            r5,         [tab_nonpsyRdo8]
%else
    %error Unsupported BIT_DEPTH!
%endif

    movq           xm2,        r5
    vpxor           m4,        m4                              ; per-lane cost accumulator
    vpxor           m3,        m3                              ; zero (FMA addend / final fold)
    vcvtqq2pd      m11,        m12                             ; psyScale as double; vfmadd213pd leaves it intact, so convert once

    ; Two passes of 8 coefficients each (rows 1-2, then rows 3-4). The rows of a
    ; 4x4 block are contiguous, so each pass is one 16-byte load per input and
    ; one 64-byte store.
%assign PRQ4_OFS 0
%rep 2
    vpmovsxwq       m6,        [r0 + PRQ4_OFS]                 ; signCoef
    vpmovsxwq       m7,        [r1 + PRQ4_OFS]
    psubq           m7,        m6                              ; predictedCoef

    vcvtqq2pd       m9,        m6
    vfmadd213pd     m9,        m9,             m3
    vcvtpd2qq       m8,        m9
    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits

    vcvtqq2pd      m10,        m7
    vfmadd213pd    m10,        m11,            m3              ; m10 = psyScale * predictedCoef + 0
    vcvtpd2qq       m9,        m10
    vpsraq          m9,        RDO_MAX_4                       ;(psyScale * predictedCoef) >> max

    psubq           m8,        m9
    paddq           m4,        m8
    movu           [r2 + 4 * PRQ4_OFS], m8
%assign PRQ4_OFS PRQ4_OFS + 16
%endrep
%undef PRQ4_OFS

    ; Horizontal sum of the 8 qword lanes of m4 into the low qword of xm4.
    vextracti32x8  ym2,        m4,             1
    paddq          ym4,        ym2
    vextracti32x4  xm2,        m4,             1
    paddq          xm4,        xm2
    punpckhqdq     xm2,        xm4,            xm3
    paddq          xm4,        xm2

    paddq          xm0,        xm4
    paddq          xm1,        xm4

    movq           [r3],       xm0
    movq           [r4],       xm1
    RET

;---------------------------------------------------------------------------------------------------------------------------------------------------------
; void psyRdoQuant8(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
;
; AVX-512 kernel for one 4x4 coefficient group at blkPos inside a block whose rows are 8 coefficients apart
; (16 bytes in the int16_t inputs, 64 bytes in costUncoded):
;     predictedCoef  = m_fencDctCoeff[i] - m_resiDctCoeff[i]
;     costUncoded[i] = ((m_resiDctCoeff[i] * m_resiDctCoeff[i]) << scaleBits) - ((*psyScale * predictedCoef) >> RDO_MAX_8)
; The sum of the 16 costs is added to *totalUncodedCost and to *totalRdCost.
;
; Registers: r0 = m_resiDctCoeff, r1 = m_fencDctCoeff, r2 = costUncoded, r3 = totalUncodedCost, r4 = totalRdCost,
;            r5 = psyScale pointer (reused afterwards for scaleBits), r6 = blkPos.
; Only r0-r6 are used, so 7 GPRs are requested.
;---------------------------------------------------------------------------------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal psyRdoQuant8, 5, 7, 15
%if WIN64
    mov             r5,        r5m                             ; cglobal loads only the first 5 arguments
%endif
    mov            r6d,        r6m                             ; blkPos, zero-extended to 64 bits
    vpbroadcastq   m12,        [r5]                            ; psyScale
    lea             r0,        [r0 + 2 * r6]
    lea             r1,        [r1 + 2 * r6]
    lea             r6,        [4 * r6]
    lea             r2,        [r2 + 2 * r6]                   ; r2 = &costUncoded[blkPos]
    movq           xm0,        [r3]                            ; running *totalUncodedCost
    movq           xm1,        [r4]                            ; running *totalRdCost

%if BIT_DEPTH == 12
    mov            r5,         [tab_nonpsyRdo12 + 8]           ; scaleBits
%elif BIT_DEPTH == 10
    mov            r5,         [tab_nonpsyRdo10 + 8]
%elif BIT_DEPTH == 8
    mov            r5,         [tab_nonpsyRdo8 + 8]
%else
    %error Unsupported BIT_DEPTH!
%endif

    movq           xm2,        r5
    vpxor           m4,        m4                              ; per-lane cost accumulator
    vpxor           m3,        m3                              ; zero (FMA addend / final fold)
    vcvtqq2pd      m11,        m12                             ; psyScale as double; vfmadd213pd leaves it intact, so convert once

    ; Each pass handles two consecutive rows (4 coefficients from each).
%assign PRQ8_ROW 0
%rep 2
    movq           xm13,       [r0 + PRQ8_ROW * 16]
    movq           xm14,       [r1 + PRQ8_ROW * 16]
    pinsrq         xm13,       [r0 + (PRQ8_ROW + 1) * 16], 1
    pinsrq         xm14,       [r1 + (PRQ8_ROW + 1) * 16], 1
    vpmovsxwq       m6,        xm13                            ; signCoef
    vpmovsxwq       m7,        xm14
    psubq           m7,        m6                              ; predictedCoef

    vcvtqq2pd       m9,        m6
    vfmadd213pd     m9,        m9,             m3
    vcvtpd2qq       m8,        m9
    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits

    vcvtqq2pd      m10,        m7
    vfmadd213pd    m10,        m11,            m3              ; m10 = psyScale * predictedCoef + 0
    vcvtpd2qq       m9,        m10
    vpsraq          m9,        RDO_MAX_8                       ;(psyScale * predictedCoef) >> max

    psubq           m8,        m9
    paddq           m4,        m8
    movu           [r2 + PRQ8_ROW * 64], ym8
    vextracti32x8  [r2 + (PRQ8_ROW + 1) * 64], m8, 1
%assign PRQ8_ROW PRQ8_ROW + 2
%endrep
%undef PRQ8_ROW

    ; Horizontal sum of the 8 qword lanes of m4 into the low qword of xm4.
    vextracti32x8  ym2,        m4,             1
    paddq          ym4,        ym2
    vextracti32x4  xm2,        m4,             1
    paddq          xm4,        xm2
    punpckhqdq     xm2,        xm4,            xm3
    paddq          xm4,        xm2

    paddq          xm0,        xm4
    paddq          xm1,        xm4

    movq           [r3],       xm0
    movq           [r4],       xm1
    RET

;---------------------------------------------------------------------------------------------------------------------------------------------------------
; void psyRdoQuant16(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
;
; AVX-512 kernel for one 4x4 coefficient group at blkPos inside a block whose rows are 16 coefficients apart
; (32 bytes in the int16_t inputs, 128 bytes in costUncoded):
;     predictedCoef  = m_fencDctCoeff[i] - m_resiDctCoeff[i]
;     costUncoded[i] = ((m_resiDctCoeff[i] * m_resiDctCoeff[i]) << scaleBits) - ((*psyScale * predictedCoef) >> RDO_MAX_16)
; The sum of the 16 costs is added to *totalUncodedCost and to *totalRdCost.
;
; Registers: r0 = m_resiDctCoeff, r1 = m_fencDctCoeff, r2 = costUncoded, r3 = totalUncodedCost, r4 = totalRdCost,
;            r5 = psyScale pointer (reused afterwards for scaleBits), r6 = blkPos.
; Only r0-r6 are used, so 7 GPRs are requested.
;---------------------------------------------------------------------------------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal psyRdoQuant16, 5, 7, 15
%if WIN64
    mov             r5,        r5m                             ; cglobal loads only the first 5 arguments
%endif
    mov            r6d,        r6m                             ; blkPos, zero-extended to 64 bits
    vpbroadcastq   m12,        [r5]                            ; psyScale
    lea             r0,        [r0 + 2 * r6]
    lea             r1,        [r1 + 2 * r6]
    lea             r6,        [4 * r6]
    lea             r2,        [r2 + 2 * r6]                   ; r2 = &costUncoded[blkPos]
    movq           xm0,        [r3]                            ; running *totalUncodedCost
    movq           xm1,        [r4]                            ; running *totalRdCost

%if BIT_DEPTH == 12
    mov            r5,         [tab_nonpsyRdo12 + 16]          ; scaleBits
%elif BIT_DEPTH == 10
    mov            r5,         [tab_nonpsyRdo10 + 16]
%elif BIT_DEPTH == 8
    mov            r5,         [tab_nonpsyRdo8 + 16]
%else
    %error Unsupported BIT_DEPTH!
%endif

    movq           xm2,        r5
    vpxor           m4,        m4                              ; per-lane cost accumulator
    vpxor           m3,        m3                              ; zero (FMA addend / final fold)
    vcvtqq2pd      m11,        m12                             ; psyScale as double; vfmadd213pd leaves it intact, so convert once

    ; Each pass handles two consecutive rows (4 coefficients from each).
%assign PRQ16_ROW 0
%rep 2
    movq           xm13,       [r0 + PRQ16_ROW * 32]
    movq           xm14,       [r1 + PRQ16_ROW * 32]
    pinsrq         xm13,       [r0 + (PRQ16_ROW + 1) * 32], 1
    pinsrq         xm14,       [r1 + (PRQ16_ROW + 1) * 32], 1
    vpmovsxwq       m6,        xm13                            ; signCoef
    vpmovsxwq       m7,        xm14
    psubq           m7,        m6                              ; predictedCoef

    vcvtqq2pd       m9,        m6
    vfmadd213pd     m9,        m9,             m3
    vcvtpd2qq       m8,        m9
    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits

    vcvtqq2pd      m10,        m7
    vfmadd213pd    m10,        m11,            m3              ; m10 = psyScale * predictedCoef + 0
    vcvtpd2qq       m9,        m10
    vpsraq          m9,        RDO_MAX_16                      ;(psyScale * predictedCoef) >> max

    psubq           m8,        m9
    paddq           m4,        m8
    movu           [r2 + PRQ16_ROW * 128], ym8
    vextracti32x8  [r2 + (PRQ16_ROW + 1) * 128], m8, 1
%assign PRQ16_ROW PRQ16_ROW + 2
%endrep
%undef PRQ16_ROW

    ; Horizontal sum of the 8 qword lanes of m4 into the low qword of xm4.
    vextracti32x8  ym2,        m4,             1
    paddq          ym4,        ym2
    vextracti32x4  xm2,        m4,             1
    paddq          xm4,        xm2
    punpckhqdq     xm2,        xm4,            xm3
    paddq          xm4,        xm2

    paddq          xm0,        xm4
    paddq          xm1,        xm4

    movq           [r3],       xm0
    movq           [r4],       xm1
    RET

;---------------------------------------------------------------------------------------------------------------------------------------------------------
; void psyRdoQuant32(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
;
; AVX-512 kernel for one 4x4 coefficient group at blkPos inside a block whose rows are 32 coefficients apart
; (64 bytes in the int16_t inputs, 256 bytes in costUncoded):
;     predictedCoef  = m_fencDctCoeff[i] - m_resiDctCoeff[i]
;     costUncoded[i] = ((m_resiDctCoeff[i] * m_resiDctCoeff[i]) << scaleBits) - ((*psyScale * predictedCoef) >> RDO_MAX_32)
; The sum of the 16 costs is added to *totalUncodedCost and to *totalRdCost.
;
; Registers: r0 = m_resiDctCoeff, r1 = m_fencDctCoeff, r2 = costUncoded, r3 = totalUncodedCost, r4 = totalRdCost,
;            r5 = psyScale pointer (reused afterwards for scaleBits), r6 = blkPos.
; Only r0-r6 are used, so 7 GPRs are requested.
;---------------------------------------------------------------------------------------------------------------------------------------------------------
INIT_ZMM avx512
cglobal psyRdoQuant32, 5, 7, 15
%if WIN64
    mov             r5,        r5m                             ; cglobal loads only the first 5 arguments
%endif
    mov            r6d,        r6m                             ; blkPos, zero-extended to 64 bits
    vpbroadcastq   m12,        [r5]                            ; psyScale
    lea             r0,        [r0 + 2 * r6]
    lea             r1,        [r1 + 2 * r6]
    lea             r6,        [4 * r6]
    lea             r2,        [r2 + 2 * r6]                   ; r2 = &costUncoded[blkPos]
    movq           xm0,        [r3]                            ; running *totalUncodedCost
    movq           xm1,        [r4]                            ; running *totalRdCost

%if BIT_DEPTH == 12
    mov            r5,         [tab_nonpsyRdo12 + 24]          ; scaleBits
%elif BIT_DEPTH == 10
    mov            r5,         [tab_nonpsyRdo10 + 24]
%elif BIT_DEPTH == 8
    mov            r5,         [tab_nonpsyRdo8 + 24]
%else
    %error Unsupported BIT_DEPTH!
%endif

    movq           xm2,        r5
    vpxor           m4,        m4                              ; per-lane cost accumulator
    vpxor           m3,        m3                              ; zero (FMA addend / final fold)
    vcvtqq2pd      m11,        m12                             ; psyScale as double; vfmadd213pd leaves it intact, so convert once

    ; Each pass handles two consecutive rows (4 coefficients from each).
%assign PRQ32_ROW 0
%rep 2
    movq           xm13,       [r0 + PRQ32_ROW * 64]
    movq           xm14,       [r1 + PRQ32_ROW * 64]
    pinsrq         xm13,       [r0 + (PRQ32_ROW + 1) * 64], 1
    pinsrq         xm14,       [r1 + (PRQ32_ROW + 1) * 64], 1
    vpmovsxwq       m6,        xm13                            ; signCoef
    vpmovsxwq       m7,        xm14
    psubq           m7,        m6                              ; predictedCoef

    vcvtqq2pd       m9,        m6
    vfmadd213pd     m9,        m9,             m3
    vcvtpd2qq       m8,        m9
    vpsllq          m8,        xm2                             ;(signCoef * signCoef) << scaleBits

    vcvtqq2pd      m10,        m7
    vfmadd213pd    m10,        m11,            m3              ; m10 = psyScale * predictedCoef + 0
    vcvtpd2qq       m9,        m10
    vpsraq          m9,        RDO_MAX_32                      ;(psyScale * predictedCoef) >> max

    psubq           m8,        m9
    paddq           m4,        m8
    movu           [r2 + PRQ32_ROW * 256], ym8
    vextracti32x8  [r2 + (PRQ32_ROW + 1) * 256], m8, 1
%assign PRQ32_ROW PRQ32_ROW + 2
%endrep
%undef PRQ32_ROW

    ; Horizontal sum of the 8 qword lanes of m4 into the low qword of xm4.
    vextracti32x8  ym2,        m4,             1
    paddq          ym4,        ym2
    vextracti32x4  xm2,        m4,             1
    paddq          xm4,        xm2
    punpckhqdq     xm2,        xm4,            xm3
    paddq          xm4,        xm2

    paddq          xm0,        xm4
    paddq          xm1,        xm4

    movq           [r3],       xm0
    movq           [r4],       xm1
    RET

;---------------------------------------------------------------------------------------------------------------------------------------------------------
; void nonPsyRdoQuant4(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
;
; AVX2 kernel for the 16 coefficients of a 4x4 block that starts at blkPos:
;     costUncoded[i] = (m_resiDctCoeff[i] * m_resiDctCoeff[i]) << scaleBits
; The sum of the 16 costs is added to *totalUncodedCost and to *totalRdCost.
;
; Registers: r0 = m_resiDctCoeff, r1 = costUncoded, r2 = totalUncodedCost, r3 = totalRdCost, r4 = blkPos,
;            r5 = scratch (scaleBits). Vector registers up to m13 are used.
;---------------------------------------------------------------------------------------------------------------------------------------------------------
INIT_YMM avx2
cglobal nonPsyRdoQuant4, 5, 6, 14
    mov            r4d,        r4m                             ; blkPos, zero-extended to 64 bits
    lea             r0,        [r0 + 2 * r4]                   ; r0 = &m_resiDctCoeff[blkPos]
    lea             r4,        [4 * r4]
    lea             r1,        [r1 + 2 * r4]                   ; r1 = &costUncoded[blkPos]
    movq           xm0,        [r2]                            ; running *totalUncodedCost
    movq           xm1,        [r3]                            ; running *totalRdCost

%if BIT_DEPTH == 12
    mov            r5,         [tab_nonpsyRdo12]               ; scaleBits
%elif BIT_DEPTH == 10
    mov            r5,         [tab_nonpsyRdo10]
%elif BIT_DEPTH == 8
    mov            r5,         [tab_nonpsyRdo8]
%else
    %error Unsupported BIT_DEPTH!
%endif
    movq           xm2,        r5
    vpxor           m4,        m4                              ; per-lane cost accumulator
    vpxor           m3,        m3                              ; zero (FMA addend / final fold)

    ; One pass per row of 4 coefficients (8 bytes in, 32 bytes out).
    ; The sign-extending load targets an xmm register so that exactly the 4 words
    ; that are consumed get read; a ymm destination would fetch 16 bytes and the
    ; last row would read 8 bytes beyond the end of the 4x4 block.
%assign NPQ4A_OFS 0
%rep 4
    vpmovsxwd      xm6,        [r0 + NPQ4A_OFS]                ; 4 x int16 -> 4 x int32
    vcvtdq2pd       m9,        xm6                             ; -> 4 x double
    vfmadd213pd     m9,        m9,             m3              ; m9 = m9 * m9 + 0
    vcvtpd2dq      xm8,        m9                              ; -> 4 x int32
    vpmovsxdq      m13,        xm8                             ; 32 bit int to 64 bit int
    vpsllq         m13,        xm2                             ;(signCoef * signCoef) << scaleBits
    paddq           m4,        m13
    movu           [r1 + 4 * NPQ4A_OFS], m13
%assign NPQ4A_OFS NPQ4A_OFS + 8
%endrep
%undef NPQ4A_OFS

    ; Horizontal sum of the 4 qword lanes of m4 into the low qword of xm4.
    vextracti128   xm2,        m4,             1
    paddq          xm4,        xm2
    punpckhqdq     xm2,        xm4,            xm3
    paddq          xm4,        xm2

    paddq          xm0,        xm4
    paddq          xm1,        xm4

    movq           [r2],       xm0
    movq           [r3],       xm1
    RET



;-----------------------------------------------------------------------------
; nonPsyRdoQuant8 (AVX2)
;
; Arguments (x86inc numbering):
;   r0 = base of the signed 16-bit coefficients
;   r1 = base of the 64-bit costUncoded entries
;   r2 = pointer to a 64-bit total, updated in place
;   r3 = pointer to a second 64-bit total, updated in place
;   r4 = 32-bit element index applied to both r0 and r1
;        NOTE(review): presumably the block position of the coefficient group
;        - confirm against the C caller.
;
; Handles 4 rows of 4 coefficients, consecutive rows being 8 elements apart:
;   costUncoded[i] = (coef[i] * coef[i]) << scaleBits
; The sum of the 16 costs is then added to both [r2] and [r3].
;
; Vector registers: m0-m2 scratch, xm3 = scaleBits, m4 = running sum,
;                   m5 = zero, xm6/xm7 = the two totals.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal nonPsyRdoQuant8, 5, 5, 8
    mov             r4d,        r4m                             ; 32-bit write zero-extends the index
    lea             r0,         [r0 + 2 * r4]                   ; 2 bytes per coefficient
    lea             r1,         [r1 + 8 * r4]                   ; 8 bytes per cost entry
%if BIT_DEPTH == 12
    mov             r4,         [tab_nonpsyRdo12 + 8]           ; scaleBits; the index is no longer needed
%elif BIT_DEPTH == 10
    mov             r4,         [tab_nonpsyRdo10 + 8]
%elif BIT_DEPTH == 8
    mov             r4,         [tab_nonpsyRdo8 + 8]
%else
    %error Unsupported BIT_DEPTH!
%endif
    movq            xm3,        r4                              ; shift count for vpsllq
    movq            xm6,        [r2]
    movq            xm7,        [r3]
    vpxor           m4,         m4                              ; running sum of all costs
    vpxor           m5,         m5                              ; constant zero (FMA addend, unpack filler)

    ; Row 0. movq clears the bytes above the 8 it loads, so xm0 never needs
    ; to be zeroed beforehand.
    movq            xm0,        [r0]                            ; 4 x int16 coefficients
    vpmovsxwd       m1,         xm0                             ; int16 -> int32
    vcvtdq2pd       m2,         xm1                             ; int32 -> double
    vfmadd213pd     m2,         m2,             m5              ; coef * coef + 0.0
    vcvtpd2dq       xm1,        m2                              ; double -> int32
    vpmovsxdq       m0,         xm1                             ; int32 -> int64
    vpsllq          m0,         xm3                             ; costUncoded = square << scaleBits
    paddq           m4,         m0
    movu            [r1],       m0

    ; Row 1: +8 coefficients (16 bytes), +8 costs (64 bytes)
    movq            xm0,        [r0 + 1 * 16]
    vpmovsxwd       m1,         xm0
    vcvtdq2pd       m2,         xm1
    vfmadd213pd     m2,         m2,             m5
    vcvtpd2dq       xm1,        m2
    vpmovsxdq       m0,         xm1
    vpsllq          m0,         xm3
    paddq           m4,         m0
    movu            [r1 + 1 * 64], m0

    ; Row 2
    movq            xm0,        [r0 + 2 * 16]
    vpmovsxwd       m1,         xm0
    vcvtdq2pd       m2,         xm1
    vfmadd213pd     m2,         m2,             m5
    vcvtpd2dq       xm1,        m2
    vpmovsxdq       m0,         xm1
    vpsllq          m0,         xm3
    paddq           m4,         m0
    movu            [r1 + 2 * 64], m0

    ; Row 3
    movq            xm0,        [r0 + 3 * 16]
    vpmovsxwd       m1,         xm0
    vcvtdq2pd       m2,         xm1
    vfmadd213pd     m2,         m2,             m5
    vcvtpd2dq       xm1,        m2
    vpmovsxdq       m0,         xm1
    vpsllq          m0,         xm3
    paddq           m4,         m0
    movu            [r1 + 3 * 64], m0

    ; Horizontal add of the four 64-bit lanes of m4 into the low qword of xm4
    vextracti128    xm2,        m4,             1
    paddq           xm4,        xm2
    punpckhqdq      xm2,        xm4,            xm5             ; move the high qword down, zero above it
    paddq           xm4,        xm2

    paddq           xm6,        xm4
    paddq           xm7,        xm4

    movq            [r2],       xm6
    movq            [r3],       xm7
    RET
;-----------------------------------------------------------------------------
; nonPsyRdoQuant16 (AVX2)
;
; Arguments (x86inc numbering):
;   r0 = base of the signed 16-bit coefficients
;   r1 = base of the 64-bit costUncoded entries
;   r2 = pointer to a 64-bit total, updated in place
;   r3 = pointer to a second 64-bit total, updated in place
;   r4 = 32-bit element index applied to both r0 and r1
;        NOTE(review): presumably the block position of the coefficient group
;        - confirm against the C caller.
;
; Handles 4 rows of 4 coefficients, consecutive rows being 16 elements apart:
;   costUncoded[i] = (coef[i] * coef[i]) << scaleBits
; The sum of the 16 costs is then added to both [r2] and [r3].
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal nonPsyRdoQuant16, 5, 5, 8
    mov             r4d,        r4m                             ; 32-bit write zero-extends the index
    lea             r0,         [r0 + 2 * r4]                   ; 2 bytes per coefficient
    lea             r4,         [4 * r4]
    lea             r1,         [r1 + 2 * r4]                   ; index * 8: 8 bytes per cost entry
%if BIT_DEPTH == 12
    mov             r4,         [tab_nonpsyRdo12 + 16]          ; scaleBits; the index is no longer needed
%elif BIT_DEPTH == 10
    mov             r4,         [tab_nonpsyRdo10 + 16]
%elif BIT_DEPTH == 8
    mov             r4,         [tab_nonpsyRdo8 + 16]
%else
    %error Unsupported BIT_DEPTH!
%endif
    movq            xm3,        r4                              ; shift count for vpsllq
    movq            xm6,        [r2]
    movq            xm7,        [r3]
    vpxor           m4,         m4                              ; running sum of all costs
    vpxor           m5,         m5                              ; constant zero (FMA addend, unpack filler)

    ; One iteration per row. With mmsize = 32 a row step is 16 coefficients
    ; (mmsize bytes) on the input and 16 costs (4 * mmsize bytes) on the output.
%assign nprq16_row 0
%rep 4
    movq            xm0,        [r0 + nprq16_row * mmsize]      ; 4 x int16 coefficients
    vpmovsxwd       m1,         xm0                             ; int16 -> int32
    vcvtdq2pd       m2,         xm1                             ; int32 -> double
    vfmadd213pd     m2,         m2,             m5              ; coef * coef + 0.0
    vcvtpd2dq       xm1,        m2                              ; double -> int32
    vpmovsxdq       m0,         xm1                             ; int32 -> int64
    vpsllq          m0,         xm3                             ; costUncoded = square << scaleBits
    paddq           m4,         m0
    movu            [r1 + nprq16_row * 4 * mmsize], m0
%assign nprq16_row nprq16_row + 1
%endrep
%undef nprq16_row

    ; Horizontal add of the four 64-bit lanes of m4 into the low qword of xm4
    vextracti128    xm2,        m4,             1
    paddq           xm4,        xm2
    punpckhqdq      xm2,        xm4,            xm5             ; move the high qword down, zero above it
    paddq           xm4,        xm2

    paddq           xm6,        xm4
    paddq           xm7,        xm4

    movq            [r2],       xm6
    movq            [r3],       xm7
    RET
;-----------------------------------------------------------------------------
; nonPsyRdoQuant32 (AVX2)
;
; Arguments (x86inc numbering):
;   r0 = base of the signed 16-bit coefficients
;   r1 = base of the 64-bit costUncoded entries
;   r2 = pointer to a 64-bit total, updated in place
;   r3 = pointer to a second 64-bit total, updated in place
;   r4 = 32-bit element index applied to both r0 and r1
;        NOTE(review): presumably the block position of the coefficient group
;        - confirm against the C caller.
;
; Handles 4 rows of 4 coefficients, consecutive rows being 32 elements apart:
;   costUncoded[i] = (coef[i] * coef[i]) << scaleBits
; The sum of the 16 costs is then added to both [r2] and [r3].
;
; Vector registers: m0-m2 scratch, xm3 = scaleBits, m4 = running sum,
;                   m5 = zero, xm6/xm7 = the two totals.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal nonPsyRdoQuant32, 5, 5, 8
    mov             r4d,        r4m                             ; 32-bit write zero-extends the index
    lea             r0,         [r0 + 2 * r4]                   ; 2 bytes per coefficient
    lea             r1,         [r1 + 8 * r4]                   ; 8 bytes per cost entry
%if BIT_DEPTH == 12
    mov             r4,         [tab_nonpsyRdo12 + 24]          ; scaleBits; the index is no longer needed
%elif BIT_DEPTH == 10
    mov             r4,         [tab_nonpsyRdo10 + 24]
%elif BIT_DEPTH == 8
    mov             r4,         [tab_nonpsyRdo8 + 24]
%else
    %error Unsupported BIT_DEPTH!
%endif
    movq            xm3,        r4                              ; shift count for vpsllq
    movq            xm6,        [r2]
    movq            xm7,        [r3]
    vpxor           m4,         m4                              ; running sum of all costs
    vpxor           m5,         m5                              ; constant zero (FMA addend, unpack filler)

    ; Row 0. movq clears the bytes above the 8 it loads, so xm0 never needs
    ; to be zeroed beforehand.
    movq            xm0,        [r0]                            ; 4 x int16 coefficients
    vpmovsxwd       m1,         xm0                             ; int16 -> int32
    vcvtdq2pd       m2,         xm1                             ; int32 -> double
    vfmadd213pd     m2,         m2,             m5              ; coef * coef + 0.0
    vcvtpd2dq       xm1,        m2                              ; double -> int32
    vpmovsxdq       m0,         xm1                             ; int32 -> int64
    vpsllq          m0,         xm3                             ; costUncoded = square << scaleBits
    paddq           m4,         m0
    movu            [r1],       m0

    ; Row 1: +32 coefficients (64 bytes), +32 costs (256 bytes)
    movq            xm0,        [r0 + 1 * 64]
    vpmovsxwd       m1,         xm0
    vcvtdq2pd       m2,         xm1
    vfmadd213pd     m2,         m2,             m5
    vcvtpd2dq       xm1,        m2
    vpmovsxdq       m0,         xm1
    vpsllq          m0,         xm3
    paddq           m4,         m0
    movu            [r1 + 1 * 256], m0

    ; Row 2
    movq            xm0,        [r0 + 2 * 64]
    vpmovsxwd       m1,         xm0
    vcvtdq2pd       m2,         xm1
    vfmadd213pd     m2,         m2,             m5
    vcvtpd2dq       xm1,        m2
    vpmovsxdq       m0,         xm1
    vpsllq          m0,         xm3
    paddq           m4,         m0
    movu            [r1 + 2 * 256], m0

    ; Row 3
    movq            xm0,        [r0 + 3 * 64]
    vpmovsxwd       m1,         xm0
    vcvtdq2pd       m2,         xm1
    vfmadd213pd     m2,         m2,             m5
    vcvtpd2dq       xm1,        m2
    vpmovsxdq       m0,         xm1
    vpsllq          m0,         xm3
    paddq           m4,         m0
    movu            [r1 + 3 * 256], m0

    ; Horizontal add of the four 64-bit lanes of m4 into the low qword of xm4
    vextracti128    xm2,        m4,             1
    paddq           xm4,        xm2
    punpckhqdq      xm2,        xm4,            xm5             ; move the high qword down, zero above it
    paddq           xm4,        xm2

    paddq           xm6,        xm4
    paddq           xm7,        xm4

    movq            [r2],       xm6
    movq            [r3],       xm7
    RET

;-----------------------------------------------------------------------------
; psyRdoQuant_1p4 (AVX2)
;
; Arguments (x86inc numbering):
;   r0 = base of the signed 16-bit coefficients
;   r1 = base of the 64-bit cost entries
;   r2 = pointer to a 64-bit total, updated in place
;   r3 = pointer to a second 64-bit total, updated in place
;   r4 = 32-bit element index applied to both r0 and r1
;        NOTE(review): presumably the block position of the coefficient group
;        - confirm against the C caller.
;
; Handles 4 rows of 4 coefficients, consecutive rows being 4 elements apart:
;   cost[i] = (signCoef[i] * signCoef[i]) << scaleBits
; The sum of the 16 costs is then added to both [r2] and [r3].
;
; Only r0-r5 and m0-m5 are used, so no callee-saved register is touched on
; either x86-64 ABI.
; Vector registers: xm0/xm1 = the two totals, xm2 = scaleBits, m3 = zero,
;                   m4 = running sum, m5 = per-row scratch.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal psyRdoQuant_1p4, 5, 6, 6
    mov             r4d,        r4m                             ; 32-bit write zero-extends the index
    lea             r0,         [r0 + 2 * r4]                   ; 2 bytes per coefficient
    lea             r1,         [r1 + 8 * r4]                   ; 8 bytes per cost entry
    movq            xm0,        [r2]
    movq            xm1,        [r3]

%if BIT_DEPTH == 12
    mov             r5,         [tab_nonpsyRdo12]               ; scaleBits
%elif BIT_DEPTH == 10
    mov             r5,         [tab_nonpsyRdo10]
%elif BIT_DEPTH == 8
    mov             r5,         [tab_nonpsyRdo8]
%else
    %error Unsupported BIT_DEPTH!
%endif
    movq            xm2,        r5                              ; shift count for vpsllq
    vpxor           m3,         m3                              ; constant zero (FMA addend, unpack filler)
    vpxor           m4,         m4                              ; running sum of all costs

    ; Row 0. The xmm destination makes vpmovsxwd read exactly the 8 bytes
    ; (4 coefficients) that are consumed; a ymm destination would read 16.
    vpmovsxwd       xm5,        [r0]                            ; int16 -> int32
    vcvtdq2pd       m5,         xm5                             ; int32 -> double
    vfmadd213pd     m5,         m5,             m3              ; signCoef * signCoef + 0.0
    vcvtpd2dq       xm5,        m5                              ; double -> int32
    vpmovsxdq       m5,         xm5                             ; int32 -> int64
    vpsllq          m5,         xm2                             ; (signCoef * signCoef) << scaleBits
    paddq           m4,         m5
    movu            [r1],       m5

    ; Row 1: +4 coefficients (8 bytes), +4 costs (32 bytes)
    vpmovsxwd       xm5,        [r0 + 1 * 8]
    vcvtdq2pd       m5,         xm5
    vfmadd213pd     m5,         m5,             m3
    vcvtpd2dq       xm5,        m5
    vpmovsxdq       m5,         xm5
    vpsllq          m5,         xm2
    paddq           m4,         m5
    movu            [r1 + 1 * 32], m5

    ; Row 2
    vpmovsxwd       xm5,        [r0 + 2 * 8]
    vcvtdq2pd       m5,         xm5
    vfmadd213pd     m5,         m5,             m3
    vcvtpd2dq       xm5,        m5
    vpmovsxdq       m5,         xm5
    vpsllq          m5,         xm2
    paddq           m4,         m5
    movu            [r1 + 2 * 32], m5

    ; Row 3
    vpmovsxwd       xm5,        [r0 + 3 * 8]
    vcvtdq2pd       m5,         xm5
    vfmadd213pd     m5,         m5,             m3
    vcvtpd2dq       xm5,        m5
    vpmovsxdq       m5,         xm5
    vpsllq          m5,         xm2
    paddq           m4,         m5
    movu            [r1 + 3 * 32], m5

    ; Horizontal add of the four 64-bit lanes of m4 into the low qword of xm4
    ; (xm2 is free again: the shift count is no longer needed)
    vextracti128    xm2,        m4,             1
    paddq           xm4,        xm2
    punpckhqdq      xm2,        xm4,            xm3             ; move the high qword down, zero above it
    paddq           xm4,        xm2

    paddq           xm0,        xm4
    paddq           xm1,        xm4

    movq            [r2],       xm0
    movq            [r3],       xm1
    RET
;-----------------------------------------------------------------------------
; psyRdoQuant_1p8 (AVX2)
;
; Arguments (x86inc numbering) - five are consumed, r0 through r4:
;   r0 = base of the signed 16-bit coefficients
;   r1 = base of the 64-bit cost entries
;   r2 = pointer to a 64-bit total, updated in place
;   r3 = pointer to a second 64-bit total, updated in place
;   r4 = 32-bit element index applied to both r0 and r1
;        NOTE(review): presumably the block position of the coefficient group
;        - confirm against the C caller.
;
; Handles 4 rows of 4 coefficients, consecutive rows being 8 elements apart:
;   cost[i] = (signCoef[i] * signCoef[i]) << scaleBits
; The sum of the 16 costs is then added to both [r2] and [r3].
;
; Only r0-r5 and m0-m5 are used, so no callee-saved register is touched on
; either x86-64 ABI.
; Vector registers: xm0/xm1 = the two totals, xm2 = scaleBits, m3 = zero,
;                   m4 = running sum, m5 = per-row scratch.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal psyRdoQuant_1p8, 5, 6, 6
    mov             r4d,        r4m                             ; 32-bit write zero-extends the index
    lea             r0,         [r0 + 2 * r4]                   ; 2 bytes per coefficient
    lea             r1,         [r1 + 8 * r4]                   ; 8 bytes per cost entry
    movq            xm0,        [r2]
    movq            xm1,        [r3]
%if BIT_DEPTH == 12
    mov             r5,         [tab_nonpsyRdo12 + 8]           ; scaleBits
%elif BIT_DEPTH == 10
    mov             r5,         [tab_nonpsyRdo10 + 8]
%elif BIT_DEPTH == 8
    mov             r5,         [tab_nonpsyRdo8 + 8]
%else
    %error Unsupported BIT_DEPTH!
%endif
    movq            xm2,        r5                              ; shift count for vpsllq
    vpxor           m3,         m3                              ; constant zero (FMA addend, unpack filler)
    vpxor           m4,         m4                              ; running sum of all costs

    ; Row 0. The xmm destination makes vpmovsxwd read exactly the 8 bytes
    ; (4 coefficients) that are consumed; a ymm destination would read 16.
    vpmovsxwd       xm5,        [r0]                            ; int16 -> int32
    vcvtdq2pd       m5,         xm5                             ; int32 -> double
    vfmadd213pd     m5,         m5,             m3              ; signCoef * signCoef + 0.0
    vcvtpd2dq       xm5,        m5                              ; double -> int32
    vpmovsxdq       m5,         xm5                             ; int32 -> int64
    vpsllq          m5,         xm2                             ; (signCoef * signCoef) << scaleBits
    paddq           m4,         m5
    movu            [r1],       m5

    ; Row 1: +8 coefficients (16 bytes), +8 costs (64 bytes)
    vpmovsxwd       xm5,        [r0 + 1 * 16]
    vcvtdq2pd       m5,         xm5
    vfmadd213pd     m5,         m5,             m3
    vcvtpd2dq       xm5,        m5
    vpmovsxdq       m5,         xm5
    vpsllq          m5,         xm2
    paddq           m4,         m5
    movu            [r1 + 1 * 64], m5

    ; Row 2
    vpmovsxwd       xm5,        [r0 + 2 * 16]
    vcvtdq2pd       m5,         xm5
    vfmadd213pd     m5,         m5,             m3
    vcvtpd2dq       xm5,        m5
    vpmovsxdq       m5,         xm5
    vpsllq          m5,         xm2
    paddq           m4,         m5
    movu            [r1 + 2 * 64], m5

    ; Row 3
    vpmovsxwd       xm5,        [r0 + 3 * 16]
    vcvtdq2pd       m5,         xm5
    vfmadd213pd     m5,         m5,             m3
    vcvtpd2dq       xm5,        m5
    vpmovsxdq       m5,         xm5
    vpsllq          m5,         xm2
    paddq           m4,         m5
    movu            [r1 + 3 * 64], m5

    ; Horizontal add of the four 64-bit lanes of m4 into the low qword of xm4
    ; (xm2 is free again: the shift count is no longer needed)
    vextracti128    xm2,        m4,             1
    paddq           xm4,        xm2
    punpckhqdq      xm2,        xm4,            xm3             ; move the high qword down, zero above it
    paddq           xm4,        xm2

    paddq           xm0,        xm4
    paddq           xm1,        xm4

    movq            [r2],       xm0
    movq            [r3],       xm1
    RET

;-----------------------------------------------------------------------------
; psyRdoQuant_1p16 (AVX2)
;
; Arguments (x86inc numbering) - five are consumed, r0 through r4:
;   r0 = base of the signed 16-bit coefficients
;   r1 = base of the 64-bit cost entries
;   r2 = pointer to a 64-bit total, updated in place
;   r3 = pointer to a second 64-bit total, updated in place
;   r4 = 32-bit element index applied to both r0 and r1
;        NOTE(review): presumably the block position of the coefficient group
;        - confirm against the C caller.
;
; Handles 4 rows of 4 coefficients, consecutive rows being 16 elements apart:
;   cost[i] = (signCoef[i] * signCoef[i]) << scaleBits
; The sum of the 16 costs is then added to both [r2] and [r3].
;
; Only r0-r5 and m0-m5 are used, so no callee-saved register is touched on
; either x86-64 ABI.
; Vector registers: xm0/xm1 = the two totals, xm2 = scaleBits, m3 = zero,
;                   m4 = running sum, m5 = per-row scratch.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal psyRdoQuant_1p16, 5, 6, 6
    mov             r4d,        r4m                             ; 32-bit write zero-extends the index
    lea             r0,         [r0 + 2 * r4]                   ; 2 bytes per coefficient
    lea             r1,         [r1 + 8 * r4]                   ; 8 bytes per cost entry
    movq            xm0,        [r2]
    movq            xm1,        [r3]
%if BIT_DEPTH == 12
    mov             r5,         [tab_nonpsyRdo12 + 16]          ; scaleBits
%elif BIT_DEPTH == 10
    mov             r5,         [tab_nonpsyRdo10 + 16]
%elif BIT_DEPTH == 8
    mov             r5,         [tab_nonpsyRdo8 + 16]
%else
    %error Unsupported BIT_DEPTH!
%endif
    movq            xm2,        r5                              ; shift count for vpsllq
    vpxor           m3,         m3                              ; constant zero (FMA addend, unpack filler)
    vpxor           m4,         m4                              ; running sum of all costs

    ; Row 0. The xmm destination makes vpmovsxwd read exactly the 8 bytes
    ; (4 coefficients) that are consumed; a ymm destination would read 16.
    vpmovsxwd       xm5,        [r0]                            ; int16 -> int32
    vcvtdq2pd       m5,         xm5                             ; int32 -> double
    vfmadd213pd     m5,         m5,             m3              ; signCoef * signCoef + 0.0
    vcvtpd2dq       xm5,        m5                              ; double -> int32
    vpmovsxdq       m5,         xm5                             ; int32 -> int64
    vpsllq          m5,         xm2                             ; (signCoef * signCoef) << scaleBits
    paddq           m4,         m5
    movu            [r1],       m5

    ; Row 1: +16 coefficients (32 bytes), +16 costs (128 bytes)
    vpmovsxwd       xm5,        [r0 + 1 * 32]
    vcvtdq2pd       m5,         xm5
    vfmadd213pd     m5,         m5,             m3
    vcvtpd2dq       xm5,        m5
    vpmovsxdq       m5,         xm5
    vpsllq          m5,         xm2
    paddq           m4,         m5
    movu            [r1 + 1 * 128], m5

    ; Row 2
    vpmovsxwd       xm5,        [r0 + 2 * 32]
    vcvtdq2pd       m5,         xm5
    vfmadd213pd     m5,         m5,             m3
    vcvtpd2dq       xm5,        m5
    vpmovsxdq       m5,         xm5
    vpsllq          m5,         xm2
    paddq           m4,         m5
    movu            [r1 + 2 * 128], m5

    ; Row 3
    vpmovsxwd       xm5,        [r0 + 3 * 32]
    vcvtdq2pd       m5,         xm5
    vfmadd213pd     m5,         m5,             m3
    vcvtpd2dq       xm5,        m5
    vpmovsxdq       m5,         xm5
    vpsllq          m5,         xm2
    paddq           m4,         m5
    movu            [r1 + 3 * 128], m5

    ; Horizontal add of the four 64-bit lanes of m4 into the low qword of xm4
    ; (xm2 is free again: the shift count is no longer needed)
    vextracti128    xm2,        m4,             1
    paddq           xm4,        xm2
    punpckhqdq      xm2,        xm4,            xm3             ; move the high qword down, zero above it
    paddq           xm4,        xm2

    paddq           xm0,        xm4
    paddq           xm1,        xm4

    movq            [r2],       xm0
    movq            [r3],       xm1
    RET

;-----------------------------------------------------------------------------
; psyRdoQuant_1p32 (AVX2)
;
; Arguments (x86inc numbering) - five are consumed, r0 through r4:
;   r0 = base of the signed 16-bit coefficients
;   r1 = base of the 64-bit cost entries
;   r2 = pointer to a 64-bit total, updated in place
;   r3 = pointer to a second 64-bit total, updated in place
;   r4 = 32-bit element index applied to both r0 and r1
;        NOTE(review): presumably the block position of the coefficient group
;        - confirm against the C caller.
;
; Handles 4 rows of 4 coefficients, consecutive rows being 32 elements apart:
;   cost[i] = (signCoef[i] * signCoef[i]) << scaleBits
; The sum of the 16 costs is then added to both [r2] and [r3].
;
; Only r0-r5 and m0-m5 are used, so no callee-saved register is touched on
; either x86-64 ABI.
; Vector registers: xm0/xm1 = the two totals, xm2 = scaleBits, m3 = zero,
;                   m4 = running sum, m5 = per-row scratch.
;-----------------------------------------------------------------------------
INIT_YMM avx2
cglobal psyRdoQuant_1p32, 5, 6, 6
    mov             r4d,        r4m                             ; 32-bit write zero-extends the index
    lea             r0,         [r0 + 2 * r4]                   ; 2 bytes per coefficient
    lea             r1,         [r1 + 8 * r4]                   ; 8 bytes per cost entry
    movq            xm0,        [r2]
    movq            xm1,        [r3]
%if BIT_DEPTH == 12
    mov             r5,         [tab_nonpsyRdo12 + 24]          ; scaleBits
%elif BIT_DEPTH == 10
    mov             r5,         [tab_nonpsyRdo10 + 24]
%elif BIT_DEPTH == 8
    mov             r5,         [tab_nonpsyRdo8 + 24]
%else
    %error Unsupported BIT_DEPTH!
%endif
    movq            xm2,        r5                              ; shift count for vpsllq
    vpxor           m3,         m3                              ; constant zero (FMA addend, unpack filler)
    vpxor           m4,         m4                              ; running sum of all costs

    ; Row 0. The xmm destination makes vpmovsxwd read exactly the 8 bytes
    ; (4 coefficients) that are consumed; a ymm destination would read 16.
    vpmovsxwd       xm5,        [r0]                            ; int16 -> int32
    vcvtdq2pd       m5,         xm5                             ; int32 -> double
    vfmadd213pd     m5,         m5,             m3              ; signCoef * signCoef + 0.0
    vcvtpd2dq       xm5,        m5                              ; double -> int32
    vpmovsxdq       m5,         xm5                             ; int32 -> int64
    vpsllq          m5,         xm2                             ; (signCoef * signCoef) << scaleBits
    paddq           m4,         m5
    movu            [r1],       m5

    ; Row 1: +32 coefficients (64 bytes), +32 costs (256 bytes)
    vpmovsxwd       xm5,        [r0 + 1 * 64]
    vcvtdq2pd       m5,         xm5
    vfmadd213pd     m5,         m5,             m3
    vcvtpd2dq       xm5,        m5
    vpmovsxdq       m5,         xm5
    vpsllq          m5,         xm2
    paddq           m4,         m5
    movu            [r1 + 1 * 256], m5

    ; Row 2
    vpmovsxwd       xm5,        [r0 + 2 * 64]
    vcvtdq2pd       m5,         xm5
    vfmadd213pd     m5,         m5,             m3
    vcvtpd2dq       xm5,        m5
    vpmovsxdq       m5,         xm5
    vpsllq          m5,         xm2
    paddq           m4,         m5
    movu            [r1 + 2 * 256], m5

    ; Row 3
    vpmovsxwd       xm5,        [r0 + 3 * 64]
    vcvtdq2pd       m5,         xm5
    vfmadd213pd     m5,         m5,             m3
    vcvtpd2dq       xm5,        m5
    vpmovsxdq       m5,         xm5
    vpsllq          m5,         xm2
    paddq           m4,         m5
    movu            [r1 + 3 * 256], m5

    ; Horizontal add of the four 64-bit lanes of m4 into the low qword of xm4
    ; (xm2 is free again: the shift count is no longer needed)
    vextracti128    xm2,        m4,             1
    paddq           xm4,        xm2
    punpckhqdq      xm2,        xm4,            xm3             ; move the high qword down, zero above it
    paddq           xm4,        xm2

    paddq           xm0,        xm4
    paddq           xm1,        xm4

    movq            [r2],       xm0
    movq            [r3],       xm1
    RET

%endif
